import fs from 'node:fs';
import { parse } from 'csv-parse';
import { withClient } from '../db/pool';

export async function importCsv(path: string) {
  // Stream-parse the CSV instead of loading the whole file into memory.
  const parser = fs.createReadStream(path).pipe(parse({ columns: true, trim: true }));
  const batch: Array<{ email: string; name: string }> = [];

  for await (const row of parser) {
    if (!row.email) continue;
    batch.push({ email: row.email, name: row.name ?? '' });
    if (batch.length >= 500) {
      // Awaiting here is the backpressure: the parser pauses until the insert lands.
      await insertBatch(batch.splice(0, batch.length));
    }
  }
  if (batch.length) await insertBatch(batch);
}

async function insertBatch(rows: Array<{ email: string; name: string }>) {
  await withClient(async (client) => {
    // One parameterized multi-row INSERT: ($1, $2), ($3, $4), ...
    const values = rows
      .map((_, i) => `($${i * 2 + 1}, $${i * 2 + 2})`)
      .join(',');
    const params = rows.flatMap((r) => [r.email, r.name]);
    await client.query(
      `INSERT INTO users(email, name) VALUES ${values} ON CONFLICT (email) DO UPDATE SET name = EXCLUDED.name`,
      params
    );
  });
}
The first time a CSV import OOM’d a production process, I stopped trusting ‘just read it into memory’. Now I stream the upload to disk (or S3), stream-parse the rows, and batch the inserts into the DB. The win is that memory usage stays flat, even when the file is huge. I also validate per row and collect rejected rows into an error report instead of failing the entire import. Backpressure matters here: await your DB writes so the parser doesn’t outrun your database. If performance becomes an issue, I add a small concurrency limit, but I never go fully unbounded. This keeps imports predictable, debuggable, and safe as data sizes grow.
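The rejected-rows report doesn’t need much machinery. Here’s a minimal sketch of how per-row validation could slot into the loop above; isValidEmail, the Reject shape, and the .rejects.json output path are placeholders of mine, not part of the code above, so swap in whatever validation and reporting you already have.

type Reject = { record: number; reason: string; raw: Record<string, string> };

// Placeholder check: a rough shape test, deliberately not RFC-complete.
function isValidEmail(value: string): boolean {
  return /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(value);
}

export async function importCsvWithReport(path: string) {
  const parser = fs.createReadStream(path).pipe(parse({ columns: true, trim: true }));
  const batch: Array<{ email: string; name: string }> = [];
  const rejects: Reject[] = [];
  let record = 0;

  for await (const row of parser) {
    record++;
    if (!row.email || !isValidEmail(row.email)) {
      rejects.push({ record, reason: 'missing or invalid email', raw: row });
      continue; // skip the bad row, keep the import going
    }
    batch.push({ email: row.email, name: row.name ?? '' });
    if (batch.length >= 500) await insertBatch(batch.splice(0, batch.length));
  }
  if (batch.length) await insertBatch(batch);

  // Write the error report next to the input instead of failing the whole import.
  if (rejects.length) {
    await fs.promises.writeFile(`${path}.rejects.json`, JSON.stringify(rejects, null, 2));
  }
  return { imported: record - rejects.length, rejected: rejects.length };
}

For the concurrency limit, the bound can stay tiny. This is a sketch of the shape I mean rather than a drop-in: MAX_IN_FLIGHT of 3 is an arbitrary example, and failed batches are collected instead of thrown mid-stream so the loop keeps applying backpressure.

const MAX_IN_FLIGHT = 3;

export async function importCsvConcurrent(path: string) {
  const parser = fs.createReadStream(path).pipe(parse({ columns: true, trim: true }));
  const batch: Array<{ email: string; name: string }> = [];
  const inFlight = new Set<Promise<void>>();
  const failures: unknown[] = [];

  const schedule = async (rows: Array<{ email: string; name: string }>) => {
    // Bounded backpressure: never let more than MAX_IN_FLIGHT inserts run at once.
    while (inFlight.size >= MAX_IN_FLIGHT) await Promise.race(inFlight);
    const p = insertBatch(rows)
      .catch((err) => { failures.push(err); }) // collect, report at the end
      .finally(() => inFlight.delete(p));      // a settled insert frees its slot
    inFlight.add(p);
  };

  for await (const row of parser) {
    if (!row.email) continue;
    batch.push({ email: row.email, name: row.name ?? '' });
    if (batch.length >= 500) await schedule(batch.splice(0, batch.length));
  }
  if (batch.length) await schedule(batch);
  await Promise.all(inFlight); // drain whatever is still running

  if (failures.length) throw new Error(`${failures.length} batch insert(s) failed`);
}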