diff --git a/docs/cli/me-wikipedia.md b/docs/cli/me-wikipedia.md new file mode 100644 index 0000000..40d0a8c --- /dev/null +++ b/docs/cli/me-wikipedia.md @@ -0,0 +1,70 @@ +# me wikipedia + +Download and import Wikimedia article dumps as Memory Engine memories. + +## Commands + +- [me wikipedia import](#me-wikipedia-import) -- download and import a Wikipedia XML dump + +--- + +## me wikipedia import + +Download and import a Wikipedia dump. Wikimedia article dumps use the **MediaWiki XML export format**, usually distributed as a **bzip2-compressed** `.xml.bz2` archive such as `enwiki-latest-pages-articles-multistream.xml.bz2`. + +``` +me wikipedia import [source] [options] +``` + +| Argument | Required | Description | +|----------|----------|-------------| +| `source` | no | Wiki slug (`simplewiki`, `enwiki`), dump URL, or local `.xml` / `.xml.bz2` file. Defaults to `simplewiki`. | + +| Option | Description | +|--------|-------------| +| `--wiki <name>` | Wiki database name when `source` is omitted or a local file (default: `simplewiki`). | +| `--date <date>` | Dump date for Wikimedia URLs (default: `latest`). | +| `--dump-kind <kind>` | Wikimedia dump kind (default: `pages-articles-multistream`). | +| `--cache-dir <dir>` | Directory for downloaded dump archives. | +| `--force-download` | Redownload even when the cache file exists. | +| `--download-only` | Download the dump archive and exit. | +| `--tree-root <root>` | Tree root for imported memories (default: `wikipedia`). | +| `--namespace <n>` | MediaWiki namespace number to import (default: `0`, articles). | +| `--include-redirects` | Import redirect pages. Redirects are skipped by default. | +| `--content-mode <mode>` | Content to store: `plain` or `wikitext` (default: `plain`). | +| `--max-content-bytes <bytes>` | Truncate each memory content to this many UTF-8 bytes (`0` disables truncation). | +| `--limit <n>` | Maximum article memories to process after filters. Useful for samples. 
| +| `--batch-size <n>` | Memories to buffer before each `memory.batchCreate` (default: `500`). | +| `--dry-run` | Parse and estimate without writing memories. | +| `--update-existing` | Update existing deterministic Wikipedia memories instead of skipping them. | +| `-v, --verbose` | Show per-batch progress output. | + +### Examples + +```bash +# Cheap validation run against Simple English Wikipedia +me wikipedia import --dry-run --limit 1000 + +# Import Simple English Wikipedia +me wikipedia import simplewiki + +# Import full English Wikipedia +me wikipedia import enwiki + +# Use an already-downloaded archive +me wikipedia import ~/Downloads/enwiki-latest-pages-articles-multistream.xml.bz2 --wiki enwiki + +# Download only +me wikipedia import enwiki --download-only +``` + +### Memory shape + +Each imported article becomes one memory: + +- `content`: `# Title` followed by either cleaned plain text or raw wikitext. +- `tree`: `<tree-root>.<primary_category_slug>`, where `<tree-root>` is the `--tree-root` value and `primary_category_slug` is the first category in `meta.categories` normalized for ltree, for example `wikipedia.relational_databases`. Articles without categories use `wikipedia.uncategorized`. +- `temporal`: current revision timestamp from the dump. +- `meta`: source metadata including `source_wiki`, `source_page_id`, `source_revision_id`, `source_title`, `source_url`, `categories`, `primary_category`, `primary_category_slug`, `source_format`, `content_format`, and importer version. + +IDs are deterministic per `(wiki, page_id)`, so re-running the same import skips already-created articles instead of duplicating them; pass `--update-existing` to update them in place instead. diff --git a/packages/cli/commands/wikipedia.ts b/packages/cli/commands/wikipedia.ts new file mode 100644 index 0000000..770f25a --- /dev/null +++ b/packages/cli/commands/wikipedia.ts @@ -0,0 +1,727 @@ +/** + * me wikipedia — import Wikimedia article dumps as memories. 
+ */ + +import { existsSync } from "node:fs"; +import { homedir } from "node:os"; +import { basename, join, resolve } from "node:path"; +import * as clack from "@clack/prompts"; +import type { MemoryCreateParams } from "@memory.build/protocol/engine"; +import { Command } from "commander"; +import { batchCreateChunked } from "../chunk.ts"; +import { createClient, type EngineClient } from "../client.ts"; +import { resolveCredentials } from "../credentials.ts"; +import { getOutputFormat, type OutputFormat, output } from "../output.ts"; +import { handleError, requireEngine, requireSession } from "../util.ts"; +import { + buildWikipediaDumpUrl, + buildWikipediaMemory, + DEFAULT_WIKIPEDIA_DUMP_DATE, + DEFAULT_WIKIPEDIA_DUMP_KIND, + DEFAULT_WIKIPEDIA_WIKI, + downloadFile, + inferDumpDateFromDumpName, + inferDumpKindFromDumpName, + inferWikiSlugFromDumpName, + openDumpTextStream, + streamMediaWikiPages, + WIKIPEDIA_DUMP_FORMAT, + type WikipediaContentMode, +} from "../wikipedia.ts"; + +const DEFAULT_TREE_ROOT = "wikipedia"; +const DEFAULT_BATCH_SIZE = 500; +const OPENAI_TEXT_EMBEDDING_3_SMALL_USD_PER_MILLION_TOKENS = 0.02; +const VALID_TREE_ROOT_RE = /^[a-z0-9_]+(\.[a-z0-9_]+)*$/; +const VALID_WIKI_SLUG_RE = /^[a-z0-9_]+wiki$/i; + +interface ResolvedWikipediaSource { + wikiSlug: string; + dumpDate?: string; + dumpKind?: string; + sourceUrl?: string; + sourcePath: string; + downloaded: boolean; + bytesDownloaded?: number; + totalBytes?: number; +} + +interface WikipediaImportStats { + dryRun: boolean; + dumpFormat: string; + sourcePath: string; + sourceUrl?: string; + wikiSlug: string; + dumpDate?: string; + dumpKind?: string; + treeRoot: string; + namespace: number; + includeRedirects: boolean; + contentMode: WikipediaContentMode; + pagesScanned: number; + namespaceSkipped: number; + redirectsSkipped: number; + emptyContentSkipped: number; + memoriesPrepared: number; + contentTruncated: number; + imported: number; + updated: number; + skipped: number; + failed: number; + 
estimatedContentBytes: number; + estimatedEmbeddingTokens: number; + estimatedEmbeddingCostUsd: number; + errors: Array<{ source: string; error: string; itemCount?: number }>; +} + +export function createWikipediaCommand(): Command { + const wikipedia = new Command("wikipedia").description( + "import Wikipedia dumps", + ); + wikipedia.addCommand(createWikipediaImportCommand()); + return wikipedia; +} + +function createWikipediaImportCommand(): Command { + return new Command("import") + .description("download and import a Wikipedia XML dump as memories") + .argument( + "[source]", + "wiki slug (simplewiki/enwiki), dump URL, or local .xml/.xml.bz2 file", + ) + .option( + "--wiki ", + `wiki database name when source is omitted or a local file (default: ${DEFAULT_WIKIPEDIA_WIKI})`, + DEFAULT_WIKIPEDIA_WIKI, + ) + .option( + "--date ", + `dump date for Wikimedia URLs (default: ${DEFAULT_WIKIPEDIA_DUMP_DATE})`, + DEFAULT_WIKIPEDIA_DUMP_DATE, + ) + .option( + "--dump-kind ", + `Wikimedia dump kind (default: ${DEFAULT_WIKIPEDIA_DUMP_KIND})`, + DEFAULT_WIKIPEDIA_DUMP_KIND, + ) + .option( + "--cache-dir ", + "directory for downloaded dump archives", + defaultWikipediaCacheDir(), + ) + .option("--force-download", "redownload even when the cache file exists") + .option("--download-only", "download the dump archive and exit") + .option( + "--tree-root ", + `tree root for imported memories (default: ${DEFAULT_TREE_ROOT})`, + DEFAULT_TREE_ROOT, + ) + .option( + "--namespace ", + "MediaWiki namespace number to import (0 = articles)", + "0", + ) + .option("--include-redirects", "import redirect pages (default: skip)") + .option( + "--content-mode ", + "article content to store: plain or wikitext", + "plain", + ) + .option( + "--max-content-bytes ", + "truncate each memory content to this many UTF-8 bytes (0 disables truncation)", + ) + .option("--limit ", "maximum article memories to process after filters") + .option( + "--batch-size ", + `memories to buffer before each batchCreate 
(default: ${DEFAULT_BATCH_SIZE})`, + String(DEFAULT_BATCH_SIZE), + ) + .option("--dry-run", "parse and estimate without writing memories") + .option( + "--update-existing", + "update existing deterministic Wikipedia memories instead of skipping them", + ) + .option("-v, --verbose", "show per-batch progress output") + .action(async (source: string | undefined, opts, cmd) => { + const globalOpts = cmd.optsWithGlobals(); + const fmt = getOutputFormat(globalOpts); + const requiresEngine = opts.dryRun !== true && opts.downloadOnly !== true; + + let engine: EngineClient | undefined; + if (requiresEngine) { + const creds = resolveCredentials(globalOpts.server); + requireSession(creds, fmt); + requireEngine(creds, fmt); + engine = createClient({ url: creds.server, apiKey: creds.apiKey }); + } + + try { + const resolvedSource = await resolveWikipediaSource(source, opts, fmt); + + if (opts.downloadOnly) { + await output( + { + downloaded: resolvedSource.downloaded, + path: resolvedSource.sourcePath, + url: resolvedSource.sourceUrl, + wikiSlug: resolvedSource.wikiSlug, + dumpDate: resolvedSource.dumpDate, + dumpKind: resolvedSource.dumpKind, + dumpFormat: WIKIPEDIA_DUMP_FORMAT, + bytesDownloaded: resolvedSource.bytesDownloaded, + totalBytes: resolvedSource.totalBytes, + }, + fmt, + () => { + const verb = resolvedSource.downloaded + ? 
"Downloaded" + : "Using cached"; + clack.log.success(`${verb} ${resolvedSource.sourcePath}`); + console.log(` Format: ${WIKIPEDIA_DUMP_FORMAT}`); + if (resolvedSource.sourceUrl) { + console.log(` URL: ${resolvedSource.sourceUrl}`); + } + }, + ); + return; + } + + const parsedOptions = parseWikipediaImportOptions(opts); + const result = await runWikipediaImport({ + engine, + resolvedSource, + fmt, + dryRun: opts.dryRun === true, + verbose: opts.verbose === true, + treeRoot: parsedOptions.treeRoot, + namespace: parsedOptions.namespace, + includeRedirects: opts.includeRedirects === true, + contentMode: parsedOptions.contentMode, + maxContentBytes: parsedOptions.maxContentBytes, + limit: parsedOptions.limit, + batchSize: parsedOptions.batchSize, + updateExisting: opts.updateExisting === true, + }); + + await output(result, fmt, () => renderWikipediaImportResult(result)); + + if (result.failed > 0 && result.imported === 0 && !result.dryRun) { + process.exit(2); + } + if (result.failed > 0 && !result.dryRun) process.exit(1); + } catch (error) { + handleError(error, fmt); + } + }); +} + +interface ParsedWikipediaImportOptions { + treeRoot: string; + namespace: number; + contentMode: WikipediaContentMode; + maxContentBytes?: number; + limit?: number; + batchSize: number; +} + +function parseWikipediaImportOptions( + opts: Record, +): ParsedWikipediaImportOptions { + const treeRoot = String(opts.treeRoot ?? DEFAULT_TREE_ROOT); + if (!VALID_TREE_ROOT_RE.test(treeRoot)) { + throw new Error( + `Invalid --tree-root: '${treeRoot}'. Must match [a-z0-9_]+(\\.[a-z0-9_]+)*`, + ); + } + + const namespace = parseNonNegativeInteger("--namespace", opts.namespace); + const contentMode = String(opts.contentMode ?? "plain"); + if (contentMode !== "plain" && contentMode !== "wikitext") { + throw new Error("Invalid --content-mode: must be plain or wikitext"); + } + + const maxContentBytes = + opts.maxContentBytes === undefined + ? 
undefined + : parseNonNegativeInteger("--max-content-bytes", opts.maxContentBytes); + const limit = + opts.limit === undefined + ? undefined + : parsePositiveInteger("--limit", opts.limit); + const batchSize = parsePositiveInteger("--batch-size", opts.batchSize); + + return { + treeRoot, + namespace, + contentMode, + maxContentBytes: maxContentBytes === 0 ? undefined : maxContentBytes, + limit, + batchSize, + }; +} + +async function resolveWikipediaSource( + source: string | undefined, + opts: Record, + fmt: OutputFormat, +): Promise { + const cacheDir = resolve(expandHome(String(opts.cacheDir))); + const requestedDumpKind = String( + opts.dumpKind ?? DEFAULT_WIKIPEDIA_DUMP_KIND, + ); + const force = opts.forceDownload === true; + + if (source && isUrl(source)) { + const url = source; + const fileName = basename(new URL(url).pathname); + const sourcePath = join(cacheDir, fileName); + const wikiSlug = normalizeWikiSlug( + inferWikiSlugFromDumpName(fileName) ?? + String(opts.wiki ?? DEFAULT_WIKIPEDIA_WIKI), + ); + const dumpDate = + inferDumpDateFromDumpName(fileName) ?? + String(opts.date ?? DEFAULT_WIKIPEDIA_DUMP_DATE); + const dumpKind = inferDumpKindFromDumpName(fileName) ?? requestedDumpKind; + const downloaded = await downloadWikipediaSource( + url, + sourcePath, + force, + fmt, + ); + return { + wikiSlug, + dumpDate, + dumpKind, + sourceUrl: url, + sourcePath, + downloaded: downloaded.downloaded, + bytesDownloaded: downloaded.bytesDownloaded, + totalBytes: downloaded.totalBytes, + }; + } + + if (source && existsSync(resolve(expandHome(source)))) { + const sourcePath = resolve(expandHome(source)); + const fileName = basename(sourcePath); + return { + wikiSlug: normalizeWikiSlug( + inferWikiSlugFromDumpName(fileName) ?? + String(opts.wiki ?? DEFAULT_WIKIPEDIA_WIKI), + ), + dumpDate: + inferDumpDateFromDumpName(fileName) ?? + String(opts.date ?? DEFAULT_WIKIPEDIA_DUMP_DATE), + dumpKind: inferDumpKindFromDumpName(fileName) ?? 
requestedDumpKind, + sourcePath, + downloaded: false, + }; + } + + const sourceLooksLikeWikiSlug = source && VALID_WIKI_SLUG_RE.test(source); + if (source && !sourceLooksLikeWikiSlug) { + throw new Error( + `Source '${source}' is not a URL, an existing file, or a wiki slug like enwiki/simplewiki.`, + ); + } + + const wikiSlug = normalizeWikiSlug( + String(source ?? opts.wiki ?? DEFAULT_WIKIPEDIA_WIKI), + ); + + const dumpDate = String(opts.date ?? DEFAULT_WIKIPEDIA_DUMP_DATE); + const url = buildWikipediaDumpUrl(wikiSlug, dumpDate, requestedDumpKind); + const fileName = basename(new URL(url).pathname); + const sourcePath = join(cacheDir, fileName); + const downloaded = await downloadWikipediaSource(url, sourcePath, force, fmt); + + return { + wikiSlug, + dumpDate, + dumpKind: requestedDumpKind, + sourceUrl: url, + sourcePath, + downloaded: downloaded.downloaded, + bytesDownloaded: downloaded.bytesDownloaded, + totalBytes: downloaded.totalBytes, + }; +} + +async function downloadWikipediaSource( + url: string, + destinationPath: string, + force: boolean, + fmt: OutputFormat, +) { + let lastProgressAt = 0; + let wroteProgress = false; + return await downloadFile(url, destinationPath, { + force, + onProgress: ({ bytesDownloaded, totalBytes }) => { + if (fmt !== "text" || !process.stderr.isTTY) return; + const now = Date.now(); + if (now - lastProgressAt < 1000) return; + lastProgressAt = now; + wroteProgress = true; + const total = totalBytes ? 
` / ${formatBytes(totalBytes)}` : ""; + process.stderr.write( + `\rDownloading ${formatBytes(bytesDownloaded)}${total}...`, + ); + }, + }).finally(() => { + if (wroteProgress) process.stderr.write("\n"); + }); +} + +interface RunWikipediaImportOptions { + engine?: EngineClient; + resolvedSource: ResolvedWikipediaSource; + fmt: OutputFormat; + dryRun: boolean; + verbose: boolean; + treeRoot: string; + namespace: number; + includeRedirects: boolean; + contentMode: WikipediaContentMode; + maxContentBytes?: number; + limit?: number; + batchSize: number; + updateExisting: boolean; +} + +async function runWikipediaImport( + options: RunWikipediaImportOptions, +): Promise { + const importedAt = new Date().toISOString(); + const stats: WikipediaImportStats = { + dryRun: options.dryRun, + dumpFormat: WIKIPEDIA_DUMP_FORMAT, + sourcePath: options.resolvedSource.sourcePath, + sourceUrl: options.resolvedSource.sourceUrl, + wikiSlug: options.resolvedSource.wikiSlug, + dumpDate: options.resolvedSource.dumpDate, + dumpKind: options.resolvedSource.dumpKind, + treeRoot: options.treeRoot, + namespace: options.namespace, + includeRedirects: options.includeRedirects, + contentMode: options.contentMode, + pagesScanned: 0, + namespaceSkipped: 0, + redirectsSkipped: 0, + emptyContentSkipped: 0, + memoriesPrepared: 0, + contentTruncated: 0, + imported: 0, + updated: 0, + skipped: 0, + failed: 0, + estimatedContentBytes: 0, + estimatedEmbeddingTokens: 0, + estimatedEmbeddingCostUsd: 0, + errors: [], + }; + + const pending: MemoryCreateParams[] = []; + let batchNumber = 0; + let stoppedEarly = false; + let lastProgressAt = 0; + const openedDump = openDumpTextStream(options.resolvedSource.sourcePath); + + const flushPending = async () => { + if (pending.length === 0) return; + const batch = pending.splice(0, pending.length); + batchNumber++; + + if (options.dryRun) { + if (options.verbose && options.fmt === "text") { + console.error( + `Validated batch ${batchNumber} (${batch.length} 
memories)`, + ); + } + return; + } + + if (!options.engine) + throw new Error("Engine client is required for import"); + const explicitIds = batch + .map((memory) => memory.id) + .filter((id): id is string => typeof id === "string"); + const { insertedIds, failedIds, errors } = await batchCreateChunked( + options.engine, + batch, + ); + stats.imported += insertedIds.length; + const insertedSet = new Set(insertedIds); + const failedSet = new Set(failedIds); + const skippedIds = explicitIds.filter( + (id) => !insertedSet.has(id) && !failedSet.has(id), + ); + + if (options.updateExisting) { + const payloadsById = new Map( + batch + .filter( + (memory): memory is MemoryCreateParams & { id: string } => + typeof memory.id === "string", + ) + .map((memory) => [memory.id, memory]), + ); + for (const skippedId of skippedIds) { + const payload = payloadsById.get(skippedId); + if (!payload) continue; + try { + await options.engine.memory.update({ + id: skippedId, + content: payload.content, + meta: payload.meta, + tree: payload.tree, + temporal: payload.temporal, + }); + stats.updated++; + } catch (error) { + stats.failed++; + stats.errors.push({ + source: `batch ${batchNumber}, update ${skippedId}`, + error: error instanceof Error ? 
error.message : String(error), + itemCount: 1, + }); + } + } + } else { + stats.skipped += skippedIds.length; + } + + for (const error of errors) { + stats.failed += error.itemCount; + stats.errors.push({ + source: `batch ${batchNumber}, chunk ${error.chunkIndex}`, + error: error.error, + itemCount: error.itemCount, + }); + } + + if (options.verbose && options.fmt === "text") { + console.error( + `Imported batch ${batchNumber}: +${insertedIds.length}, updated=${stats.updated}, skipped=${stats.skipped}, failed=${stats.failed}`, + ); + } + }; + + try { + for await (const page of streamMediaWikiPages(openedDump.stream)) { + stats.pagesScanned++; + + if (page.namespace !== options.namespace) { + stats.namespaceSkipped++; + maybeRenderProgress(options, stats, lastProgressAt, (next) => { + lastProgressAt = next; + }); + continue; + } + + const redirect = + page.redirectTitle !== undefined || + /^#REDIRECT\b/i.test(page.text.trim()); + if (redirect && !options.includeRedirects) { + stats.redirectsSkipped++; + maybeRenderProgress(options, stats, lastProgressAt, (next) => { + lastProgressAt = next; + }); + continue; + } + + const built = buildWikipediaMemory(page, { + wikiSlug: options.resolvedSource.wikiSlug, + treeRoot: options.treeRoot, + contentMode: options.contentMode, + sourceDumpPath: options.resolvedSource.sourcePath, + sourceDumpUrl: options.resolvedSource.sourceUrl, + sourceDumpDate: options.resolvedSource.dumpDate, + sourceDumpKind: options.resolvedSource.dumpKind, + importedAt, + maxContentBytes: options.maxContentBytes, + }); + + if (!built) { + stats.emptyContentSkipped++; + continue; + } + + stats.memoriesPrepared++; + stats.estimatedContentBytes += built.contentBytes; + if (built.truncated) stats.contentTruncated++; + pending.push(built.memory); + + if (pending.length >= options.batchSize) { + await flushPending(); + } + + maybeRenderProgress(options, stats, lastProgressAt, (next) => { + lastProgressAt = next; + }); + + if ( + options.limit !== undefined && 
+ stats.memoriesPrepared >= options.limit + ) { + stoppedEarly = true; + break; + } + } + + await flushPending(); + } catch (error) { + // A failure leaves the dump only partially consumed, exactly like an early + // stop. Flag it so the cleanup below closes the dump instead of awaiting a + // completion that may never settle, or that would reject and replace this + // error with a less useful one. + stoppedEarly = true; + throw error; + } finally { + if (stoppedEarly) { + openedDump.close(); + await openedDump.completion.catch(() => {}); + } else { + await openedDump.completion; + } + if (options.fmt === "text" && process.stderr.isTTY) + process.stderr.write("\n"); + } + + stats.estimatedEmbeddingTokens = estimateEmbeddingTokens( + stats.estimatedContentBytes, + ); + stats.estimatedEmbeddingCostUsd = estimateEmbeddingCostUsd( + stats.estimatedEmbeddingTokens, + ); + + return stats; +} + +/** + * Write a throttled, single-line progress update to stderr. + * + * Does nothing unless the output format is text and stderr is a TTY, or when + * less than two seconds have passed since `lastProgressAt`. The caller owns the + * timestamp; it is handed back through `setLastProgressAt` when a line is written. + */ +function maybeRenderProgress( + options: RunWikipediaImportOptions, + stats: WikipediaImportStats, + lastProgressAt: number, + setLastProgressAt: (timestamp: number) => void, +): void { + if (options.fmt !== "text" || !process.stderr.isTTY) return; + const now = Date.now(); + if (now - lastProgressAt < 2000) return; + setLastProgressAt(now); + process.stderr.write( + `\rScanned ${formatInteger(stats.pagesScanned)} pages; prepared ${formatInteger( + stats.memoriesPrepared, + )} article memories; imported ${formatInteger(stats.imported)}...`, + ); +} + +/** Print the human-readable summary of an import (or dry run) to the terminal. */ +function renderWikipediaImportResult(result: WikipediaImportStats): void { + const preparedOrImported = result.dryRun + ? result.memoriesPrepared + : result.imported; + let summary = `${result.dryRun ? "Would import" : "Imported"} ${formatInteger(preparedOrImported)} Wikipedia article ${preparedOrImported === 1 ? "memory" : "memories"}`; + // When nothing new was inserted but updates happened, lead with the update count. + if (!result.dryRun && result.imported === 0 && result.updated > 0) { + summary = `Updated ${formatInteger(result.updated)} existing Wikipedia article ${result.updated === 1 ? 
"memory" : "memories"}`; + } else if (!result.dryRun && result.updated > 0) { + summary = `${summary} and updated ${formatInteger(result.updated)} existing`; + } + clack.log.success(summary); + console.log(` Wiki: ${result.wikiSlug}`); + console.log(` Format: ${result.dumpFormat}`); + console.log(` Source: ${result.sourcePath}`); + console.log(` Tree root: ${result.treeRoot}`); + console.log(` Pages scanned: ${formatInteger(result.pagesScanned)}`); + console.log( + ` Article memories prepared: ${formatInteger(result.memoriesPrepared)}`, + ); + if (result.updated > 0) { + console.log(` Updated existing: ${formatInteger(result.updated)}`); + } + if (result.skipped > 0) { + console.log(` Already existed: ${formatInteger(result.skipped)}`); + } + if (result.failed > 0) { + console.log(` Failed: ${formatInteger(result.failed)}`); + } + // Show the skip breakdown whenever any of its three counters is non-zero, + // including runs where empty-content pages were the only thing skipped. + if ( + result.redirectsSkipped > 0 || + result.namespaceSkipped > 0 || + result.emptyContentSkipped > 0 + ) { + console.log( + ` Skipped: redirects=${formatInteger(result.redirectsSkipped)}, namespace=${formatInteger(result.namespaceSkipped)}, empty=${formatInteger(result.emptyContentSkipped)}`, + ); + } + if (result.contentTruncated > 0) { + console.log(` Truncated: ${formatInteger(result.contentTruncated)}`); + } + console.log( + ` Estimated embedded content: ${formatBytes(result.estimatedContentBytes)} ≈ ${formatInteger(result.estimatedEmbeddingTokens)} tokens (~$${result.estimatedEmbeddingCostUsd.toFixed(2)} with text-embedding-3-small)`, + ); + if (result.errors.length > 0) { + console.log(` Errors: ${result.errors.length}`); + // Only the first ten errors are listed; the remainder is summarized as a count. + for (const error of result.errors.slice(0, 10)) { + console.log(` ${error.source}: ${error.error}`); + } + if (result.errors.length > 10) { + console.log(` ... 
${result.errors.length - 10} more`); + } + } +} + +/** + * Default directory for downloaded dumps: `<cache>/memory-engine/wikipedia`, + * where `<cache>` is `$XDG_CACHE_HOME`, or `~/.cache` when that variable is + * unset or empty. + */ +function defaultWikipediaCacheDir(): string { + const base = process.env.XDG_CACHE_HOME || join(homedir(), ".cache"); + return join(base, "memory-engine", "wikipedia"); +} + +/** Expand a leading `~` or `~/` to the current user's home directory; other paths are returned unchanged. */ +function expandHome(path: string): string { + if (path === "~") return homedir(); + if (path.startsWith("~/")) return join(homedir(), path.slice(2)); + return path; +} + +/** True when `value` starts with an `http://` or `https://` scheme (case-insensitive). */ +function isUrl(value: string): boolean { + return /^https?:\/\//i.test(value); +} + +/** + * Lower-case a wiki database name and validate it against `VALID_WIKI_SLUG_RE`. + * + * @throws Error when the lower-cased value does not match the pattern. + */ +function normalizeWikiSlug(value: string): string { + const wikiSlug = value.toLowerCase(); + if (!VALID_WIKI_SLUG_RE.test(wikiSlug)) { + throw new Error( + `Invalid wiki slug '${wikiSlug}'. Use a Wikimedia database name like enwiki or simplewiki.`, + ); + } + return wikiSlug; +} + +/** + * Parse a CLI value as a base-10 integer, accepting digits only (with an + * optional leading `+`). + * + * `Number.parseInt` stops at the first non-digit, so inputs such as "10abc", + * "1.5" or "1e3" would be silently accepted as 10, 1 and 1. Returns NaN for + * anything that is not a plain run of digits or that exceeds the safe integer + * range. + */ +function parseDecimalInteger(value: unknown): number { + const text = String(value).trim(); + if (!/^\+?\d+$/.test(text)) return Number.NaN; + const parsed = Number(text); + return Number.isSafeInteger(parsed) ? parsed : Number.NaN; +} + +/** + * Parse `value` as an integer greater than or equal to zero. + * + * @param name option name used in the error message, e.g. `--namespace` + * @throws Error when `value` is not a non-negative integer + */ +function parseNonNegativeInteger(name: string, value: unknown): number { + const parsed = parseDecimalInteger(value); + if (Number.isNaN(parsed)) { + throw new Error(`Invalid ${name}: expected a non-negative integer`); + } + return parsed; +} + +/** + * Parse `value` as an integer greater than or equal to one. + * + * @param name option name used in the error message, e.g. `--limit` + * @throws Error when `value` is not a positive integer + */ +function parsePositiveInteger(name: string, value: unknown): number { + const parsed = parseDecimalInteger(value); + if (Number.isNaN(parsed) || parsed === 0) { + throw new Error(`Invalid ${name}: expected a positive integer`); + } + return parsed; +} + +/** Rough token estimate: one token per four content bytes, rounded up. */ +function estimateEmbeddingTokens(contentBytes: number): number { + return Math.ceil(contentBytes / 4); +} + +/** Convert a token count to USD using the per-million-token price constant. */ +function estimateEmbeddingCostUsd(tokens: number): number { + return ( + (tokens / 1_000_000) * OPENAI_TEXT_EMBEDDING_3_SMALL_USD_PER_MILLION_TOKENS + ); +} + +/** Format a byte count with B/KB/MB/GB/TB labels, dividing by 1024 per step. */ +function formatBytes(bytes: number): string { + const units = ["B", "KB", "MB", "GB", "TB"]; + let value = bytes; + let unitIndex = 0; + while (value >= 1024 && unitIndex < units.length - 1) { + value /= 1024; + unitIndex++; + } + const decimals = unitIndex === 0 ? 0 : value >= 10 ? 
1 : 2; + return `${value.toFixed(decimals)} ${units[unitIndex]}`; +} + +function formatInteger(value: number): string { + return new Intl.NumberFormat("en-US").format(value); +} diff --git a/packages/cli/index.ts b/packages/cli/index.ts index 3738174..6a39597 100755 --- a/packages/cli/index.ts +++ b/packages/cli/index.ts @@ -30,6 +30,7 @@ import { createUpgradeCommand } from "./commands/upgrade.ts"; import { createUserCommand } from "./commands/user.ts"; import { createVersionCommand } from "./commands/version.ts"; import { createWhoamiCommand } from "./commands/whoami.ts"; +import { createWikipediaCommand } from "./commands/wikipedia.ts"; import { setExpanded } from "./output.ts"; const SHELLS = ["zsh", "bash", "fish", "powershell"] as const; @@ -88,6 +89,9 @@ program.addCommand(createCodexCommand()); // Local web UI program.addCommand(createServeCommand()); +// Dataset imports +program.addCommand(createWikipediaCommand()); + // Engine-level RBAC commands program.addCommand(createUserCommand()); program.addCommand(createGrantCommand()); diff --git a/packages/cli/wikipedia.test.ts b/packages/cli/wikipedia.test.ts new file mode 100644 index 0000000..8b16fae --- /dev/null +++ b/packages/cli/wikipedia.test.ts @@ -0,0 +1,197 @@ +import { describe, expect, test } from "bun:test"; +import { + buildWikipediaArticleUrl, + buildWikipediaDumpUrl, + buildWikipediaMemory, + cleanWikitextToPlainText, + deterministicWikipediaPageUuidV7, + extractCategories, + inferDumpDateFromDumpName, + inferDumpKindFromDumpName, + inferWikiSlugFromDumpName, + parseMediaWikiPageXml, + streamMediaWikiPages, +} from "./wikipedia.ts"; + +const UUIDV7_RE = + /^[0-9a-f]{8}-[0-9a-f]{4}-7[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/i; + +describe("Wikipedia dump helpers", () => { + test("builds canonical Wikimedia dump URLs", () => { + expect(buildWikipediaDumpUrl("simplewiki")).toBe( + "https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-pages-articles-multistream.xml.bz2", + ); + 
expect(buildWikipediaDumpUrl("enwiki", "20260501", "pages-articles")).toBe( + "https://dumps.wikimedia.org/enwiki/20260501/enwiki-20260501-pages-articles.xml.bz2", + ); + }); + + test("infers wiki slug and date from dump names", () => { + const name = "enwiki-20260501-pages-articles-multistream.xml.bz2"; + expect(inferWikiSlugFromDumpName(name)).toBe("enwiki"); + expect(inferDumpDateFromDumpName(name)).toBe("20260501"); + expect(inferDumpKindFromDumpName(name)).toBe("pages-articles-multistream"); + }); + + test("parses one MediaWiki page XML block", () => { + const page = parseMediaWikiPageXml(` + + PostgreSQL + 0 + 23456 + + 98765 + 2026-05-01T12:34:56Z + wikitext + text/x-wiki + '''PostgreSQL''' is an [[open-source]] database & server. + abc123 + +`); + + expect(page).toEqual({ + title: "PostgreSQL", + namespace: 0, + pageId: "23456", + revisionId: "98765", + timestamp: "2026-05-01T12:34:56Z", + text: "'''PostgreSQL''' is an [[open-source]] database & server.", + redirectTitle: undefined, + model: "wikitext", + format: "text/x-wiki", + sha1: "abc123", + textBytes: 42, + }); + }); + + test("streams pages across chunk boundaries", async () => { + const xml = `A0111AlphaB0222Beta`; + const encoder = new TextEncoder(); + const stream = new ReadableStream({ + start(controller) { + controller.enqueue(encoder.encode(xml.slice(0, 75))); + controller.enqueue(encoder.encode(xml.slice(75))); + controller.close(); + }, + }); + + const pages = []; + for await (const page of streamMediaWikiPages(stream)) { + pages.push(page.title); + } + + expect(pages).toEqual(["A", "B"]); + }); + + test("extracts categories", () => { + expect( + extractCategories( + "[[Category:Relational databases]] [[Category:Free software|Databases]] [[category:Relational databases]]", + ), + ).toEqual(["Relational databases", "Free software"]); + }); + + test("cleans wikitext into readable text", () => { + const cleaned = cleanWikitextToPlainText(`{{Infobox}} +[[File:Fan.jpg|thumb|A [[fan]] moves air.]] 
+'''PostgreSQL''' is an [[open-source software|open-source]] [[database]].noise + +== History == +* Created at [https://example.com Berkeley] + +== References == +* [https://example.com] + +== Empty section == +[[Category:Relational databases]]`); + + expect(cleaned).toContain("PostgreSQL is an open-source database."); + expect(cleaned).toContain("## History"); + expect(cleaned).toContain("- Created at Berkeley"); + expect(cleaned).not.toContain("Infobox"); + expect(cleaned).not.toContain("moves air"); + expect(cleaned).not.toContain("Category"); + expect(cleaned).not.toContain("References"); + expect(cleaned).not.toContain("Empty section"); + expect(cleaned).not.toContain("\n-\n"); + expect(cleaned).not.toContain("noise"); + expect(cleaned).not.toContain("]]"); + }); + + test("builds memory payload with stable metadata", () => { + const page = parseMediaWikiPageXml(` + + PostgreSQL + 0 + 23456 + + 98765 + 2026-05-01T12:34:56Z + wikitext + text/x-wiki + '''PostgreSQL''' is an [[open-source software|open-source]] database. 
[[Category:Relational databases]] + abc123 + +`); + expect(page).not.toBeNull(); + + const built = buildWikipediaMemory(page!, { + wikiSlug: "enwiki", + treeRoot: "wikipedia", + contentMode: "plain", + sourceDumpPath: "/tmp/enwiki-latest-pages-articles-multistream.xml.bz2", + sourceDumpUrl: + "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream.xml.bz2", + sourceDumpDate: "latest", + sourceDumpKind: "pages-articles-multistream", + importedAt: "2026-05-07T00:00:00.000Z", + }); + + expect(built).not.toBeNull(); + expect(built!.memory.id).toMatch(UUIDV7_RE); + expect(built!.memory.tree).toBe("wikipedia.relational_databases"); + expect(built!.memory.content).toContain( + "# PostgreSQL\n\nPostgreSQL is an open-source database.", + ); + expect(built!.memory.meta).toMatchObject({ + type: "wikipedia_article", + source: "wikipedia", + source_wiki: "enwiki", + source_page_id: "23456", + source_revision_id: "98765", + source_title: "PostgreSQL", + source_url: "https://en.wikipedia.org/wiki/PostgreSQL", + source_format: "mediawiki_xml", + content_format: "plain_text", + categories: ["Relational databases"], + primary_category: "Relational databases", + primary_category_slug: "relational_databases", + article_slug: "postgresql", + imported_at: "2026-05-07T00:00:00.000Z", + importer_version: "1", + }); + expect(built!.memory.temporal).toEqual({ + start: "2026-05-01T12:34:56.000Z", + }); + }); + + test("deterministic page ids are stable and page-keyed", () => { + const first = deterministicWikipediaPageUuidV7("enwiki", "23456"); + const second = deterministicWikipediaPageUuidV7("enwiki", "23456"); + const differentWiki = deterministicWikipediaPageUuidV7( + "simplewiki", + "23456", + ); + + expect(first).toBe(second); + expect(first).toMatch(UUIDV7_RE); + expect(differentWiki).not.toBe(first); + }); + + test("builds article URLs", () => { + // A slash in the title must be percent-encoded and spaces become underscores. + expect(buildWikipediaArticleUrl("enwiki", "A/B 
test")).toBe( + "https://en.wikipedia.org/wiki/A%2FB_test", + ); + }); +}); diff --git a/packages/cli/wikipedia.ts b/packages/cli/wikipedia.ts new file mode 100644 index 0000000..4fe11a4 --- /dev/null +++ b/packages/cli/wikipedia.ts @@ -0,0 +1,823 @@ +/** + * Wikipedia dump import helpers. + * + * Wikimedia's public database dumps for article text are MediaWiki XML export + * files, most commonly distributed as bzip2 archives named like: + * + * enwiki-latest-pages-articles-multistream.xml.bz2 + * + * The "multistream" suffix means the .bz2 file is composed of multiple bzip2 + * streams plus a companion index file. For a sequential import we can treat it + * as a normal bzip2-compressed XML file and stream-decompress it. + */ + +import { spawn } from "node:child_process"; +import { createHash } from "node:crypto"; +import { once } from "node:events"; +import { + createWriteStream, + existsSync, + mkdirSync, + renameSync, + unlinkSync, +} from "node:fs"; +import { dirname } from "node:path"; +import { Readable } from "node:stream"; +import type { MemoryCreateParams } from "@memory.build/protocol/engine"; +import { normalizeSlug } from "./importers/slug.ts"; + +export const DEFAULT_WIKIPEDIA_WIKI = "simplewiki"; +export const DEFAULT_WIKIPEDIA_DUMP_DATE = "latest"; +export const DEFAULT_WIKIPEDIA_DUMP_KIND = "pages-articles-multistream"; +export const WIKIPEDIA_DUMP_FORMAT = + "MediaWiki XML export (usually bzip2-compressed .xml.bz2)"; +export const WIKIPEDIA_IMPORTER_VERSION = "1"; + +const WIKIPEDIA_LAUNCH_TIMESTAMP_MS = Date.UTC(2001, 0, 15); + +export type WikipediaContentMode = "plain" | "wikitext"; + +export interface WikipediaPage { + title: string; + namespace: number; + pageId: string; + revisionId: string; + timestamp?: string; + text: string; + redirectTitle?: string; + model?: string; + format?: string; + sha1?: string; + textBytes?: number; +} + +export interface WikipediaMemoryBuildOptions { + wikiSlug: string; + treeRoot: string; + contentMode: 
WikipediaContentMode; + sourceDumpPath?: string; + sourceDumpUrl?: string; + sourceDumpDate?: string; + sourceDumpKind?: string; + importedAt: string; + maxContentBytes?: number; +} + +export interface BuiltWikipediaMemory { + memory: MemoryCreateParams; + categories: string[]; + truncated: boolean; + contentBytes: number; + articleSlug: string; +} + +export interface DownloadFileResult { + path: string; + downloaded: boolean; + bytesDownloaded: number; + totalBytes?: number; +} + +export interface DownloadFileOptions { + force?: boolean; + onProgress?: (progress: { + bytesDownloaded: number; + totalBytes?: number; + }) => void | Promise; +} + +export interface OpenedDumpTextStream { + stream: ReadableStream; + completion: Promise; + close: () => void; +} + +/** Build the canonical Wikimedia dump URL for a wiki database name. */ +export function buildWikipediaDumpUrl( + wikiSlug: string, + dumpDate = DEFAULT_WIKIPEDIA_DUMP_DATE, + dumpKind = DEFAULT_WIKIPEDIA_DUMP_KIND, +): string { + return `https://dumps.wikimedia.org/${wikiSlug}/${dumpDate}/${wikiSlug}-${dumpDate}-${dumpKind}.xml.bz2`; +} + +/** Infer `enwiki` / `simplewiki` from a standard Wikimedia dump filename. */ +export function inferWikiSlugFromDumpName( + fileName: string, +): string | undefined { + const match = /^([a-z0-9_]+)-(?:latest|\d{8})-[^.]+/i.exec(fileName); + return match?.[1]?.toLowerCase(); +} + +/** Infer `latest` / `20260501` from a standard Wikimedia dump filename. */ +export function inferDumpDateFromDumpName( + fileName: string, +): string | undefined { + const match = /^[a-z0-9_]+-((?:latest|\d{8}))-[^.]+/i.exec(fileName); + return match?.[1]?.toLowerCase(); +} + +/** Infer `pages-articles-multistream` from a standard dump filename. 
/**
 * Infer the dump kind (for example `pages-articles-multistream`) from a
 * standard Wikimedia dump filename.
 *
 * @param fileName - Base name such as
 *   `enwiki-latest-pages-articles-multistream.xml.bz2`.
 * @returns The lower-cased dump kind, or `undefined` when the name does not
 *   match `<wiki>-<latest|YYYYMMDD>-<kind>.xml[.bz2]`.
 */
export function inferDumpKindFromDumpName(
  fileName: string,
): string | undefined {
  const match = /^[a-z0-9_]+-(?:latest|\d{8})-(.+?)\.xml(?:\.bz2)?$/i.exec(
    fileName,
  );
  return match?.[1]?.toLowerCase();
}

/**
 * Download a URL to disk using a streaming response body.
 *
 * The body is written to `<destinationPath>.part` and renamed into place only
 * after the write stream has finished, so an interrupted download never
 * leaves a truncated file at `destinationPath`.
 *
 * @param url - Source URL passed to `fetch`.
 * @param destinationPath - Final location of the downloaded file.
 * @param options - `force` re-downloads even if the destination exists;
 *   `onProgress` is awaited after every chunk written.
 * @returns The destination path, whether a download actually happened, and
 *   byte counts (`totalBytes` is `undefined` when the server sent no usable
 *   `content-length`).
 * @throws Error when the response is not OK, has no body, or when reading or
 *   writing fails; the partial file is removed before rethrowing.
 */
export async function downloadFile(
  url: string,
  destinationPath: string,
  options: DownloadFileOptions = {},
): Promise<DownloadFileResult> {
  if (existsSync(destinationPath) && !options.force) {
    const size = await Bun.file(destinationPath).size;
    return {
      path: destinationPath,
      downloaded: false,
      bytesDownloaded: size,
      totalBytes: size,
    };
  }

  mkdirSync(dirname(destinationPath), { recursive: true });
  const temporaryPath = `${destinationPath}.part`;
  try {
    if (existsSync(temporaryPath)) unlinkSync(temporaryPath);
  } catch {
    // Best effort cleanup; createWriteStream will surface a real error below.
  }

  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to download ${url}: HTTP ${response.status}`);
  }
  if (!response.body) {
    throw new Error(`Failed to download ${url}: empty response body`);
  }

  // A malformed content-length must not leak NaN into progress reports.
  const totalHeader = response.headers.get("content-length");
  const parsedTotal = totalHeader
    ? Number.parseInt(totalHeader, 10)
    : Number.NaN;
  const totalBytes = Number.isFinite(parsedTotal) ? parsedTotal : undefined;

  const output = createWriteStream(temporaryPath);
  // Without a persistent listener, a write error emitted while we are awaiting
  // the network (not the stream) would be an unhandled 'error' event.
  const writeFailure: { error?: Error } = {};
  output.on("error", (error: Error) => {
    writeFailure.error = error;
  });

  const reader = response.body.getReader();
  let bytesDownloaded = 0;

  try {
    while (true) {
      const { value, done } = await reader.read();
      if (done) break;
      if (writeFailure.error) throw writeFailure.error;
      bytesDownloaded += value.byteLength;
      if (!output.write(Buffer.from(value))) {
        // Respect backpressure; `once` rejects if the stream errors instead.
        await once(output, "drain");
      }
      await options.onProgress?.({ bytesDownloaded, totalBytes });
    }
    if (writeFailure.error) throw writeFailure.error;

    output.end();
    await once(output, "finish");
    renameSync(temporaryPath, destinationPath);
  } catch (error) {
    output.destroy();
    await reader.cancel().catch(() => {});
    try {
      unlinkSync(temporaryPath);
    } catch {
      // Ignore cleanup failures.
    }
    throw error;
  }

  return {
    path: destinationPath,
    downloaded: true,
    bytesDownloaded,
    totalBytes,
  };
}

/**
 * Open an XML or XML.bz2 dump as a UTF-8 byte stream.
 *
 * Bun/Node do not ship a native bzip2 decoder, so compressed Wikimedia dumps
 * are decompressed by invoking an installed bzip2-compatible CLI. We prefer
 * parallel implementations when present, then fall back to the ubiquitous
 * `bzip2 -dc`.
 *
 * @param dumpPath - Path to a `.xml` or `.xml.bz2` file; only paths ending in
 *   `.bz2` (case-insensitive) are piped through a decompressor.
 * @returns The byte stream, a `completion` promise that rejects when the
 *   decompressor exits abnormally, and a `close` callback that terminates the
 *   child process.
 * @throws Error when no decompressor is installed or its stdout is missing.
 */
export function openDumpTextStream(dumpPath: string): OpenedDumpTextStream {
  if (!dumpPath.toLowerCase().endsWith(".bz2")) {
    return {
      stream: Bun.file(dumpPath).stream() as ReadableStream<Uint8Array>,
      completion: Promise.resolve(),
      close: () => {},
    };
  }

  const decompressor = findBzip2Decompressor();
  if (!decompressor) {
    throw new Error(
      "No bzip2 decompressor found. Install bzip2, lbzip2, pbzip2, or bzcat to read Wikipedia .xml.bz2 dumps.",
    );
  }

  // bzcat decompresses to stdout by default; the others need `-dc`.
  const args = decompressor === "bzcat" ? [dumpPath] : ["-dc", dumpPath];
  const child = spawn(decompressor, args, {
    stdio: ["ignore", "pipe", "pipe"],
  });

  if (!child.stdout) {
    throw new Error(`Failed to open ${decompressor} stdout`);
  }

  // Keep only the tail of stderr so a noisy decompressor cannot grow memory.
  let stderr = "";
  child.stderr?.setEncoding("utf8");
  child.stderr?.on("data", (chunk: string) => {
    stderr = `${stderr}${chunk}`.slice(-4096);
  });

  let closeRequested = false;
  const completion = new Promise<void>((resolve, reject) => {
    child.on("error", reject);
    child.on("close", (code, signal) => {
      const detail = stderr.trim() ? `: ${stderr.trim()}` : "";
      // An early stop requested by the consumer is not a failure.
      const stoppedByConsumer =
        closeRequested || signal === "SIGTERM" || /broken pipe/i.test(stderr);
      if (code === 0 || stoppedByConsumer) {
        resolve();
        return;
      }
      reject(
        new Error(
          `${decompressor} exited ${signal ? `with signal ${signal}` : `with code ${code}`}${detail}`,
        ),
      );
    });
  });

  return {
    stream: Readable.toWeb(child.stdout) as ReadableStream<Uint8Array>,
    completion,
    close: () => {
      closeRequested = true;
      if (!child.killed) child.kill("SIGTERM");
    },
  };
}

/** Return the first bzip2-compatible CLI found on PATH, preferring parallel ones. */
function findBzip2Decompressor(): string | undefined {
  for (const command of ["lbzip2", "pbzip2", "bzip2", "bzcat"]) {
    if (Bun.which(command)) return command;
  }
  return undefined;
}

/** Opening marker of one page element in a MediaWiki XML export. */
const MEDIAWIKI_PAGE_OPEN = "<page>";
/** Closing marker of one page element in a MediaWiki XML export. */
const MEDIAWIKI_PAGE_CLOSE = "</page>";
/** Characters retained when no page start is buffered (covers a split marker). */
const MEDIAWIKI_PAGE_CARRY_CHARS = 16;

/**
 * Stream MediaWiki pages from a decompressed dump without loading the file.
 *
 * Bytes are decoded incrementally as UTF-8 and every complete
 * `<page>...</page>` block is parsed and yielded as soon as it is buffered.
 * The reader is cancelled when iteration ends, including early exits by the
 * consumer.
 */
export async function* streamMediaWikiPages(
  stream: ReadableStream<Uint8Array>,
): AsyncGenerator<WikipediaPage> {
  const reader = stream.getReader();
  const decoder = new TextDecoder();
  let buffer = "";
  const readBuffer = (): string => buffer;
  const writeBuffer = (next: string): void => {
    buffer = next;
  };

  try {
    while (true) {
      const { value, done } = await reader.read();
      if (done) {
        // Flush bytes the decoder was holding for an incomplete sequence.
        buffer += decoder.decode();
        break;
      }
      buffer += decoder.decode(value, { stream: true });
      yield* drainCompletePagesFromBuffer(readBuffer, writeBuffer);
    }

    yield* drainCompletePagesFromBuffer(readBuffer, writeBuffer);
  } finally {
    await reader.cancel().catch(() => {});
  }
}

/**
 * Yield every complete page currently in the buffer and store the unconsumed
 * remainder back through `setBuffer`.
 *
 * Pages that `parseMediaWikiPageXml` rejects (returns `null`) are skipped.
 */
function* drainCompletePagesFromBuffer(
  getBuffer: () => string,
  setBuffer: (next: string) => void,
): Generator<WikipediaPage> {
  let buffer = getBuffer();
  while (true) {
    const start = buffer.indexOf(MEDIAWIKI_PAGE_OPEN);
    if (start === -1) {
      // Keep only a small suffix in case '<page>' is split across chunks.
      setBuffer(buffer.slice(-MEDIAWIKI_PAGE_CARRY_CHARS));
      return;
    }
    if (start > 0) buffer = buffer.slice(start);

    const end = buffer.indexOf(MEDIAWIKI_PAGE_CLOSE);
    if (end === -1) {
      // Incomplete page: wait for more input.
      setBuffer(buffer);
      return;
    }

    const pageEnd = end + MEDIAWIKI_PAGE_CLOSE.length;
    const pageXml = buffer.slice(0, pageEnd);
    buffer = buffer.slice(pageEnd);
    const page = parseMediaWikiPageXml(pageXml);
    if (page) yield page;
  }
}
/**
 * Parse one `<page>...</page>` block from a MediaWiki XML export.
 *
 * Page-level fields (title, namespace, page id, redirect) are read from the
 * part before `<revision>`; revision-level fields (revision id, timestamp,
 * text, model, format, sha1) from the part after it, so the page id and the
 * revision id are not confused with each other.
 *
 * @param pageXml - The XML of a single page element.
 * @returns The parsed page, or `null` when title, namespace or page id is
 *   missing or the namespace is not an integer.
 */
export function parseMediaWikiPageXml(pageXml: string): WikipediaPage | null {
  const revisionStart = pageXml.indexOf("<revision>");
  const pageHeaderXml =
    revisionStart === -1 ? pageXml : pageXml.slice(0, revisionStart);
  const revisionXml = revisionStart === -1 ? "" : pageXml.slice(revisionStart);

  const title = extractXmlTagText(pageHeaderXml, "title");
  const namespaceText = extractXmlTagText(pageHeaderXml, "ns");
  const pageId = extractXmlTagText(pageHeaderXml, "id");
  const revisionId = extractXmlTagText(revisionXml, "id") ?? "";
  if (!title || !namespaceText || !pageId) return null;

  const namespace = Number.parseInt(namespaceText, 10);
  if (Number.isNaN(namespace)) return null;

  const redirectMatch = /<redirect\b([^>]*)\/?\s*>/i.exec(pageHeaderXml);
  const redirectTitle = redirectMatch
    ? extractXmlAttribute(redirectMatch[1] ?? "", "title")
    : undefined;

  // <text> may be a normal element or self-closing (empty revision text).
  const textMatch = /<text\b([^>]*)>([\s\S]*?)<\/text>/i.exec(revisionXml);
  const selfClosingTextMatch = /<text\b([^>]*)\/\s*>/i.exec(revisionXml);
  const textAttributes = textMatch?.[1] ?? selfClosingTextMatch?.[1] ?? "";
  const text = textMatch ? decodeXmlEntities(textMatch[2] ?? "") : "";
  const textBytesRaw = extractXmlAttribute(textAttributes, "bytes");
  const textBytes = textBytesRaw
    ? Number.parseInt(textBytesRaw, 10)
    : undefined;

  return {
    title,
    namespace,
    pageId,
    revisionId,
    timestamp: extractXmlTagText(revisionXml, "timestamp") ?? undefined,
    text,
    redirectTitle,
    model: extractXmlTagText(revisionXml, "model") ?? undefined,
    format: extractXmlTagText(revisionXml, "format") ?? undefined,
    sha1: extractXmlTagText(revisionXml, "sha1") ?? undefined,
    textBytes: Number.isFinite(textBytes) ? textBytes : undefined,
  };
}

/**
 * Return the entity-decoded text of the first `<tagName>` element in `xml`,
 * or `null` when there is no such element with a closing tag.
 */
function extractXmlTagText(xml: string, tagName: string): string | null {
  const match = new RegExp(
    `<${tagName}\\b[^>]*>([\\s\\S]*?)<\\/${tagName}>`,
    "i",
  ).exec(xml);
  return match ? decodeXmlEntities(match[1] ?? "") : null;
}

/**
 * Read one attribute value from the raw attribute section of a start tag.
 *
 * The name must start the string or follow whitespace, so asking for `title`
 * does not match the tail of a longer name such as `subtitle`. Both quote
 * styles and optional whitespace around `=` are accepted.
 *
 * @returns The entity-decoded value, or `undefined` when absent.
 */
function extractXmlAttribute(
  attributes: string,
  name: string,
): string | undefined {
  const doubleQuoted = new RegExp(
    `(?:^|\\s)${name}\\s*=\\s*"([^"]*)"`,
    "i",
  ).exec(attributes);
  if (doubleQuoted) return decodeXmlEntities(doubleQuoted[1] ?? "");
  const singleQuoted = new RegExp(
    `(?:^|\\s)${name}\\s*=\\s*'([^']*)'`,
    "i",
  ).exec(attributes);
  return singleQuoted ? decodeXmlEntities(singleQuoted[1] ?? "") : undefined;
}

/** Named entities understood by `decodeXmlEntities` (looked up lower-cased). */
const DECODABLE_NAMED_ENTITIES: Readonly<Record<string, string>> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
  nbsp: " ",
  ndash: "–",
  mdash: "—",
};

/**
 * Convert a numeric character reference to its character, or return
 * `fallback` when the value is not a Unicode scalar value.
 *
 * `String.fromCodePoint` throws a RangeError above U+10FFFF, and a lone
 * surrogate would produce ill-formed text, so both are left undecoded.
 */
function decodeNumericEntity(codePoint: number, fallback: string): string {
  const isScalarValue =
    Number.isInteger(codePoint) &&
    codePoint >= 0 &&
    codePoint <= 0x10ffff &&
    !(codePoint >= 0xd800 && codePoint <= 0xdfff);
  return isScalarValue ? String.fromCodePoint(codePoint) : fallback;
}

/**
 * Decode XML entities plus common HTML entities that survive wikitext cleanup.
 *
 * Decoding is a single pass: `&amp;lt;` becomes `&lt;`, not `<`. Unknown named
 * entities and invalid numeric references are returned unchanged instead of
 * throwing.
 */
export function decodeXmlEntities(value: string): string {
  return value.replace(
    /&(#x[0-9a-fA-F]+|#[0-9]+|[A-Za-z][A-Za-z0-9]+);/g,
    (entity, body: string) => {
      if (body.startsWith("#x")) {
        return decodeNumericEntity(Number.parseInt(body.slice(2), 16), entity);
      }
      if (body.startsWith("#")) {
        return decodeNumericEntity(Number.parseInt(body.slice(1), 10), entity);
      }
      return DECODABLE_NAMED_ENTITIES[body.toLowerCase()] ?? entity;
    },
  );
}

/**
 * Extract article categories from raw wikitext before category links are
 * stripped.
 *
 * Matches `[[Category:Name]]`, ignoring any `#fragment` and `|sort key`.
 * Names are entity-decoded, underscores become spaces, whitespace is
 * collapsed, and duplicates (compared case-insensitively) are dropped while
 * preserving first-seen order.
 */
export function extractCategories(wikitext: string): string[] {
  const categories: string[] = [];
  const seen = new Set<string>();
  const re = /\[\[\s*Category\s*:\s*([^\]|#]+)(?:#[^\]|]*)?(?:\|[^\]]*)?\]\]/gi;
  for (const match of wikitext.matchAll(re)) {
    const category = decodeXmlEntities(match[1] ?? "")
      .replace(/_/g, " ")
      .replace(/\s+/g, " ")
      .trim();
    const key = category.toLowerCase();
    if (category && !seen.has(key)) {
      seen.add(key);
      categories.push(category);
    }
  }
  return categories;
}
/**
 * Lightweight wikitext-to-plain-text conversion.
 *
 * This intentionally favors speed and predictable memory use over perfect
 * MediaWiki rendering. It removes high-noise constructs (templates, refs,
 * tables, files, categories) and keeps readable article prose plus headings,
 * which are emitted as Markdown `##`..`######` lines.
 *
 * @param wikitext - Raw wikitext of one article.
 * @returns Cleaned text; may be empty when nothing readable remains.
 */
export function cleanWikitextToPlainText(wikitext: string): string {
  let text = wikitext;

  // HTML comments, citations and extension tags whose content is not prose.
  text = text.replace(/<!--[\s\S]*?-->/g, "");
  text = text.replace(/<ref\b[^>]*\/>/gi, "");
  text = text.replace(/<ref\b[^>]*>[\s\S]*?<\/ref>/gi, "");
  // NOTE(review): tag name reconstructed as `references`; any leftover tag is
  // removed by the generic tag strip further down regardless.
  text = text.replace(/<references\b[^>]*\/>/gi, "");
  text = text.replace(/<gallery\b[^>]*>[\s\S]*?<\/gallery>/gi, "");
  text = text.replace(/<timeline\b[^>]*>[\s\S]*?<\/timeline>/gi, "");
  text = text.replace(/<score\b[^>]*>[\s\S]*?<\/score>/gi, "");
  text = text.replace(/<math\b[^>]*>[\s\S]*?<\/math>/gi, "");

  text = stripWikiTables(text);
  text = stripBalancedTemplates(text);

  // Drop file/image links and category declarations before generic link cleanup.
  // File captions often contain nested links, so this must be balanced instead
  // of a single regex; otherwise captions leak through as `...]]` fragments.
  text = stripWikiLinksByNamespace(text, ["file", "image", "category"]);

  // Headings are tagged with a private-use sentinel so the list-marker pass
  // below does not mistake their leading '#' for a numbered-list item.
  const headingSentinel = "\uE000";
  text = text.replace(
    /^(={2,6})\s*(.*?)\s*\1\s*$/gm,
    (_match, marker: string, heading: string) => {
      const markdownLevel = Math.min(marker.length, 6);
      return `${headingSentinel}${"#".repeat(markdownLevel)} ${heading.trim()}`;
    },
  );

  // Bold and italic quote markup.
  text = text.replace(/'''([^'].*?)'''/g, "$1");
  text = text.replace(/''([^'].*?)''/g, "$1");

  // External links: keep labels, remove bare URLs.
  text = text.replace(/\[https?:\/\/[^\s\]]+\s+([^\]]+)\]/gi, "$1");
  text = text.replace(/\[https?:\/\/[^\]]+\]/gi, "");

  // Internal links: [[Target|label]] -> label, [[Target]] -> Target.
  text = text.replace(/\[\[([^[\]\n]+?)\]\]/g, (_match, linkBody: string) => {
    const parts = linkBody.split("|");
    const target = (parts[0] ?? "").trim().replace(/^:/, "");
    if (/^(?:category|file|image):/i.test(target)) return "";
    const label = (parts.length > 1 ? parts[parts.length - 1] : target) ?? "";
    return label.replace(/_/g, " ").replace(/^:/, "").trim();
  });

  text = text.replace(/<br\s*\/?>/gi, "\n");
  text = text.replace(/<\/(?:p|div|section)>/gi, "\n\n");
  text = text.replace(/<[^>]+>/g, "");
  text = decodeXmlEntities(text);

  // Wikitext list markers to readable plain/markdown-ish markers.
  text = text
    .split("\n")
    .map((line) => {
      const trimmed = line.trimEnd();
      if (trimmed.startsWith(headingSentinel)) return trimmed.slice(1);
      if (/^\*+\s*/.test(trimmed)) return trimmed.replace(/^\*+\s*/, "- ");
      if (/^#+\s*/.test(trimmed)) return trimmed.replace(/^#+\s*/, "1. ");
      if (/^[;:]+\s*/.test(trimmed)) return trimmed.replace(/^[;:]+\s*/, "");
      return trimmed;
    })
    .join("\n");

  // Remove lingering table row syntax and magic words.
  text = text.replace(/^\s*(?:\|-|[|!])[^\n]*$/gm, "");
  text = text.replace(/__[A-Z_]+__/g, "");

  const cleaned = text
    .replace(/[ \t]+/g, " ")
    .replace(/[ \t]+\n/g, "\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();

  return removeEmptyListItemsAndSections(cleaned);
}

/**
 * Drop bullet lines with no text, then repeatedly drop headings whose section
 * has no content until a pass removes nothing (removing a child heading can
 * leave its parent empty).
 */
function removeEmptyListItemsAndSections(input: string): string {
  let lines = input.split("\n").filter((line) => !/^\s*[-*]\s*$/.test(line));
  let previousLength = -1;
  while (lines.length !== previousLength) {
    previousLength = lines.length;
    lines = removeEmptySections(lines);
  }
  return lines
    .join("\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}

/**
 * Remove headings that have no non-blank, non-heading line before the next
 * heading of the same or a higher level. All other lines are kept as-is.
 */
function removeEmptySections(lines: string[]): string[] {
  const output: string[] = [];
  for (let index = 0; index < lines.length; index++) {
    const line = lines[index] ?? "";
    const level = markdownSectionHeadingLevel(line);
    if (level === null) {
      output.push(line);
      continue;
    }

    // The section runs until the next heading at this level or above.
    let sectionEnd = index + 1;
    while (sectionEnd < lines.length) {
      const nextLevel = markdownSectionHeadingLevel(lines[sectionEnd] ?? "");
      if (nextLevel !== null && nextLevel <= level) break;
      sectionEnd++;
    }

    const hasSectionContent = lines
      .slice(index + 1, sectionEnd)
      .some(
        (candidateLine) =>
          candidateLine.trim().length > 0 &&
          markdownSectionHeadingLevel(candidateLine) === null,
      );
    if (hasSectionContent) output.push(line);
  }
  return output;
}

/**
 * Return the level (2-6) of a Markdown `##`..`######` heading line, or `null`
 * for anything else, including the level-1 `# Title` line.
 */
function markdownSectionHeadingLevel(line: string): number | null {
  const match = /^(#{2,6})\s+\S/.exec(line.trim());
  return match ? (match[1] ?? "").length : null;
}

/**
 * Replace `{| ... |}` table blocks with a newline, repeating until the text
 * stops changing so that nested tables are removed as well.
 */
function stripWikiTables(input: string): string {
  let previous = input;
  while (true) {
    const next = previous.replace(/\{\|[\s\S]*?\|\}/g, "\n");
    if (next === previous) return next;
    previous = next;
  }
}

/**
 * Remove whole `[[Namespace:...]]` links for the given namespaces, balancing
 * nested `[[...]]` so captions containing links are removed in one piece.
 *
 * @param input - Wikitext to scan.
 * @param namespaces - Namespace names to strip, compared case-insensitively.
 */
function stripWikiLinksByNamespace(
  input: string,
  namespaces: string[],
): string {
  const namespaceSet = new Set(
    namespaces.map((namespace) => namespace.toLowerCase()),
  );
  let output = "";
  let index = 0;

  while (index < input.length) {
    const start = input.indexOf("[[", index);
    if (start === -1) {
      output += input.slice(index);
      break;
    }

    // Only a short window is needed to read the namespace prefix.
    const linkPrefix = /^\[\[\s*:?\s*([A-Za-z]+)\s*:/i.exec(
      input.slice(start, start + 80),
    );
    if (!linkPrefix || !namespaceSet.has((linkPrefix[1] ?? "").toLowerCase())) {
      // Not a stripped namespace: keep the "[[" and continue after it.
      output += input.slice(index, start + 2);
      index = start + 2;
      continue;
    }

    output += input.slice(index, start);
    let depth = 1;
    let cursor = start + 2;
    while (cursor < input.length && depth > 0) {
      if (input.startsWith("[[", cursor)) {
        depth++;
        cursor += 2;
      } else if (input.startsWith("]]", cursor)) {
        depth--;
        cursor += 2;
      } else {
        cursor++;
      }
    }
    index = cursor;
  }

  return output;
}

/**
 * Remove `{{ ... }}` templates, including nested ones.
 *
 * Text outside any template is copied through unchanged (a stray `}}` at
 * depth zero is kept). An unclosed `{{` drops everything after it.
 */
function stripBalancedTemplates(input: string): string {
  const kept: string[] = [];
  let depth = 0;
  let segmentStart = 0;
  let index = 0;

  while (index < input.length) {
    if (input.startsWith("{{", index)) {
      // Entering the outermost template ends the current kept segment.
      if (depth === 0) kept.push(input.slice(segmentStart, index));
      depth++;
      index += 2;
      continue;
    }
    if (depth > 0 && input.startsWith("}}", index)) {
      depth--;
      index += 2;
      if (depth === 0) segmentStart = index;
      continue;
    }
    index++;
  }

  if (depth === 0) kept.push(input.slice(segmentStart));
  return kept.join("");
}

/**
 * Build a MemoryCreateParams payload for one parsed article page.
 *
 * Content is `# Title` followed by either the raw wikitext or cleaned plain
 * text, optionally truncated to `options.maxContentBytes`. The tree path is
 * `<treeRoot>.<slug of first category>`; pages without categories use
 * `Uncategorized`. The memory id is derived from wiki slug and page id only.
 *
 * @param page - Parsed page from the dump.
 * @param options - Import settings and provenance recorded in `meta`.
 * @returns The memory plus import statistics, or `null` when the page has no
 *   body after cleaning.
 */
export function buildWikipediaMemory(
  page: WikipediaPage,
  options: WikipediaMemoryBuildOptions,
): BuiltWikipediaMemory | null {
  // Categories must be read from the raw text; cleaning removes them.
  const categories = extractCategories(page.text);
  const body =
    options.contentMode === "wikitext"
      ? page.text.trim()
      : cleanWikitextToPlainText(page.text);
  if (!body) return null;

  const rawContent = `# ${page.title}\n\n${body}`;
  const truncated = truncateUtf8(rawContent, options.maxContentBytes);
  const content = truncated.text;
  const articleSlug = normalizeSlug(page.title);
  const primaryCategory = categories[0] ?? "Uncategorized";
  // NOTE(review): assumes normalizeSlug may return "" for names with no
  // sluggable characters (e.g. non-Latin categories) — confirm. The fallback
  // keeps the tree path from ending in an empty label.
  const primaryCategorySlug =
    normalizeSlug(primaryCategory) || "uncategorized";
  const tree = `${options.treeRoot}.${primaryCategorySlug}`;
  const sourceUrl = buildWikipediaArticleUrl(options.wikiSlug, page.title);
  const temporalStart =
    page.timestamp && !Number.isNaN(Date.parse(page.timestamp))
      ? new Date(Date.parse(page.timestamp)).toISOString()
      : undefined;

  const meta: Record<string, unknown> = {
    type: "wikipedia_article",
    source: "wikipedia",
    source_wiki: options.wikiSlug,
    source_page_id: page.pageId,
    source_revision_id: page.revisionId,
    source_title: page.title,
    source_namespace: page.namespace,
    source_url: sourceUrl,
    source_format: "mediawiki_xml",
    content_format:
      options.contentMode === "wikitext" ? "mediawiki_wikitext" : "plain_text",
    categories,
    primary_category: primaryCategory,
    primary_category_slug: primaryCategorySlug,
    article_slug: articleSlug,
    imported_at: options.importedAt,
    importer_version: WIKIPEDIA_IMPORTER_VERSION,
  };

  // Optional provenance is only recorded when known.
  if (options.sourceDumpPath) meta.source_dump_path = options.sourceDumpPath;
  if (options.sourceDumpUrl) meta.source_dump_url = options.sourceDumpUrl;
  if (options.sourceDumpDate) meta.source_dump_date = options.sourceDumpDate;
  if (options.sourceDumpKind) meta.source_dump_kind = options.sourceDumpKind;
  if (page.timestamp) meta.source_revision_timestamp = page.timestamp;
  if (page.redirectTitle) meta.source_redirect_title = page.redirectTitle;
  if (page.model) meta.source_model = page.model;
  if (page.format) meta.source_text_format = page.format;
  if (page.sha1) meta.source_revision_sha1 = page.sha1;
  if (page.textBytes !== undefined) meta.source_text_bytes = page.textBytes;
  if (truncated.truncated) meta.content_truncated = true;
  if (options.maxContentBytes !== undefined) {
    meta.max_content_bytes = options.maxContentBytes;
  }

  return {
    memory: {
      id: deterministicWikipediaPageUuidV7(options.wikiSlug, page.pageId),
      content,
      tree,
      meta,
      ...(temporalStart ? { temporal: { start: temporalStart } } : {}),
    },
    categories,
    truncated: truncated.truncated,
    contentBytes: Buffer.byteLength(content, "utf8"),
    articleSlug,
  };
}

/**
 * Build the public article URL for a title on the given wiki.
 *
 * Spaces become underscores and the result is percent-encoded with
 * `encodeURIComponent`, so `/` in a title is emitted as `%2F`.
 */
export function buildWikipediaArticleUrl(
  wikiSlug: string,
  title: string,
): string {
  const host = wikipediaHostFromWikiSlug(wikiSlug);
  const encodedTitle = encodeURIComponent(title.replace(/ /g, "_"));
  return `https://${host}/wiki/${encodedTitle}`;
}

/**
 * Map a wiki database name to its Wikipedia host name.
 *
 * A trailing `wiki` is dropped (`enwiki` -> `en.wikipedia.org`). Database
 * names use underscores where host names use hyphens
 * (`zh_min_nanwiki` -> `zh-min-nan.wikipedia.org`); underscores are not valid
 * in host names, so they are converted.
 */
export function wikipediaHostFromWikiSlug(wikiSlug: string): string {
  const project = wikiSlug.endsWith("wiki") ? wikiSlug.slice(0, -4) : wikiSlug;
  return `${project.replace(/_/g, "-")}.wikipedia.org`;
}

/**
 * Truncate `input` so that its UTF-8 encoding fits in `maxBytes`.
 *
 * When truncation happens a marker line is appended, and the kept content is
 * shortened so content plus marker stay within the limit. If the limit is too
 * small to hold the marker itself, the content is cut without a marker so
 * the limit still holds. A surrogate pair is never split.
 *
 * @param input - Text to truncate.
 * @param maxBytes - Byte limit; `undefined` or `<= 0` disables truncation.
 * @returns The resulting text and whether anything was cut.
 */
function truncateUtf8(
  input: string,
  maxBytes: number | undefined,
): { text: string; truncated: boolean } {
  if (maxBytes === undefined || maxBytes <= 0) {
    return { text: input, truncated: false };
  }
  if (Buffer.byteLength(input, "utf8") <= maxBytes) {
    return { text: input, truncated: false };
  }

  const suffix = "\n\n[Article truncated during Wikipedia import.]";
  const suffixBytes = Buffer.byteLength(suffix, "utf8");
  const marker = suffixBytes <= maxBytes ? suffix : "";
  const contentBudget = maxBytes - Buffer.byteLength(marker, "utf8");

  // Binary search for the longest prefix (in UTF-16 units) within budget;
  // encoded length grows monotonically with prefix length.
  let low = 0;
  let high = input.length;
  while (low < high) {
    const mid = Math.ceil((low + high) / 2);
    if (Buffer.byteLength(input.slice(0, mid), "utf8") <= contentBudget) {
      low = mid;
    } else {
      high = mid - 1;
    }
  }

  // Do not end on the first half of a surrogate pair.
  if (low > 0) {
    const lastUnit = input.charCodeAt(low - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) low--;
  }

  const head = input.slice(0, low);
  return {
    text: marker ? `${head.trimEnd()}${marker}` : head,
    truncated: true,
  };
}
/**
 * Stable UUIDv7 per Wikipedia page id.
 *
 * The id intentionally keys on page id rather than revision id so repeated
 * imports of newer dumps do not create duplicate memories for the same article.
 * The current revision id remains available in metadata.
 *
 * Layout: the 48-bit timestamp field is the fixed
 * `WIKIPEDIA_LAUNCH_TIMESTAMP_MS`; the remaining bits come from the SHA-256
 * digest of `wikipedia:<wikiSlug>:<pageId>`, with the version nibble forced to
 * 7 and the variant bits to `10`.
 *
 * @param wikiSlug - Wiki database name, e.g. `enwiki`.
 * @param pageId - Page id from the dump.
 * @returns A lower-case UUID string that is identical for identical inputs.
 */
export function deterministicWikipediaPageUuidV7(
  wikiSlug: string,
  pageId: string,
): string {
  const bytes = new Uint8Array(16);

  // Bytes 0-5: big-endian 48-bit timestamp. Division is used for the high
  // bytes because bitwise operators only see the low 32 bits of a number.
  const timestampMs = WIKIPEDIA_LAUNCH_TIMESTAMP_MS;
  bytes[0] = Math.floor(timestampMs / 2 ** 40) & 0xff;
  bytes[1] = Math.floor(timestampMs / 2 ** 32) & 0xff;
  bytes[2] = Math.floor(timestampMs / 2 ** 24) & 0xff;
  bytes[3] = Math.floor(timestampMs / 2 ** 16) & 0xff;
  bytes[4] = Math.floor(timestampMs / 2 ** 8) & 0xff;
  bytes[5] = timestampMs & 0xff;

  const digest = createHash("sha256")
    .update(`wikipedia:${wikiSlug}:${pageId}`, "utf8")
    .digest();

  // Bytes 6-7: version nibble (7) followed by 12 digest bits.
  const randA = ((digest[0] ?? 0) << 8) | (digest[1] ?? 0);
  bytes[6] = 0x70 | ((randA >> 8) & 0x0f);
  bytes[7] = randA & 0xff;

  // Byte 8: variant bits `10` plus 6 digest bits; bytes 9-15: digest bytes 3-9.
  bytes[8] = 0x80 | ((digest[2] ?? 0) & 0x3f);
  for (let i = 0; i < 7; i++) {
    bytes[9 + i] = digest[3 + i] ?? 0;
  }

  return bytesToUuid(bytes);
}

/**
 * Format 16 bytes as a lower-case `8-4-4-4-12` hex UUID string. Missing
 * bytes are rendered as `00`.
 */
function bytesToUuid(bytes: Uint8Array): string {
  const hex: string[] = [];
  for (let i = 0; i < 16; i++) {
    hex.push((bytes[i] ?? 0).toString(16).padStart(2, "0"));
  }
  return (
    `${hex.slice(0, 4).join("")}-` +
    `${hex.slice(4, 6).join("")}-` +
    `${hex.slice(6, 8).join("")}-` +
    `${hex.slice(8, 10).join("")}-` +
    `${hex.slice(10, 16).join("")}`
  );
}