diff --git a/README.md b/README.md index 83de895..8eedfe7 100644 --- a/README.md +++ b/README.md @@ -120,6 +120,15 @@ cargo run --bin metadata_indexer -- \ --dry-run ``` +Keep the jhalfs manifests current with: + +```bash +cargo run --bin metadata_indexer -- --base-dir . refresh +``` + +Passing `--books mlfs,blfs` restricts the refresh to specific books, and +`--force` bypasses the local cache. + ## 📚 Documentation - [Architecture Overview](docs/ARCHITECTURE.md) – high-level tour of the crate diff --git a/ai/tasks.json b/ai/tasks.json index abb56e2..c7fcb2f 100644 --- a/ai/tasks.json +++ b/ai/tasks.json @@ -51,6 +51,13 @@ "description": "Provide a standalone CLI to validate package metadata against the schema and regenerate ai/metadata/index.json.", "resolution": "Added src/bin/metadata_indexer.rs with schema validation, summary extraction, and index writer integration.", "owner": "default_cli" + }, + { + "id": "metadata-jhalfs-refresh", + "title": "Wire jhalfs manifests into metadata harvester", + "description": "Cache wget-list/md5sums from jhalfs and expose a CLI refresh command so harvesting can populate source URLs and checksums reliably.", + "resolution": "Extended metadata_indexer with a `refresh` subcommand, cached manifests under ai/metadata/cache/, and hooked harvest to populate MD5 checksums via jhalfs data.", + "owner": "default_cli" } ] } diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 19f9fb8..8fd1a96 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -42,8 +42,11 @@ artifacts under `ai/metadata/`: `ai/metadata/index.json` (use `--compact` for single-line JSON). - `harvest` – fetches a given book page, extracts build metadata, and emits a schema-compliant JSON skeleton. When direct HTML parsing does not locate the - source tarball, it falls back to the jhalfs `wget-list` data to populate - `source.urls`. + source tarball, it falls back to cached jhalfs manifests to populate + `source.urls` and MD5 checksums. 
+- `refresh` – downloads (or re-downloads with `--force`) the jhalfs manifests + (`wget-list`, `md5sums`) for one or more books and stores them under + `ai/metadata/cache/`. ## Module layout diff --git a/docs/METADATA_PIPELINE.md b/docs/METADATA_PIPELINE.md index 895031c..10ee366 100644 --- a/docs/METADATA_PIPELINE.md +++ b/docs/METADATA_PIPELINE.md @@ -19,7 +19,8 @@ This document explains the workflow and the supporting assets. | ------- | ----------- | | `validate` | Loads every package JSON file and validates it against `schema.json`. Reports schema violations and summary extraction errors. | | `index` | Re-runs validation and regenerates `index.json`. Use `--compact` to write a single-line JSON payload. | -| `harvest` | Fetches a book page, scrapes build instructions, and emits a draft metadata record (to stdout with `--dry-run` or into `ai/metadata/packages/`). | +| `harvest` | Fetches a book page, scrapes build instructions, and emits a draft metadata record (to stdout with `--dry-run` or into `ai/metadata/packages/`). Falls back to jhalfs manifests when inline source links are absent. | +| `refresh` | Updates cached jhalfs manifests (`wget-list`, `md5sums`) under `ai/metadata/cache/`. Supports `--books` filtering and `--force` to bypass the cache. | ### Harvesting flow @@ -32,11 +33,10 @@ This document explains the workflow and the supporting assets. 4. **Artifact stats** – `div.segmentedlist` entries supply SBU and disk usage. 5. **Source URLs** – the harvester tries two strategies: - Inline HTML links inside the page (common for BLFS articles). - - Fallback to the jhalfs `wget-list` for the selected book (currently MLFS) - using `package-management::wget_list::get_wget_list` to find matching - `<package>-<version>` entries. -6. **Checksums** – integration with the book’s `md5sums` mirror is pending; - placeholder wiring exists (`src/md5_utils.rs`). + - Fallback to the cached jhalfs `wget-list` for the selected book to find + matching `<package>-<version>` entries. +6. 
**Checksums** – the matching entry from the cached jhalfs `md5sums` + manifest populates `source.checksums` when the archive name is known. 7. **Status** – unresolved items (missing URLs, anchors, etc.) are recorded in `status.issues` so humans can interrogate or patch the draft before promoting it. @@ -46,8 +46,6 @@ This document explains the workflow and the supporting assets. - **Source links via tables** – some MLFS chapters list download links inside a “Package Information” table. The current implementation relies on the jhalfs `wget-list` fallback instead of parsing that table. -- **Checksums** – MD5 lookups from jhalfs are planned but not yet wired into - the harvest pipeline. - **Anchor discovery** – if the heading lacks an explicit `id` attribute, the scraper attempts to locate child anchors or scan the raw HTML. If none are found, a warning is recorded and `status.issues` contains a reminder. @@ -55,17 +53,15 @@ This document explains the workflow and the supporting assets. ## Using jhalfs manifests The maintained `wget-list`/`md5sums` files hosted by jhalfs provide canonical -source URLs and hashes. The helper modules `src/wget_list.rs` and -`src/md5_utils.rs` download these lists for the multilib LFS book. The -harvester currently consumes the wget-list as a fallback; integrating the -`md5sums` file will let us emit `source.checksums` automatically. +source URLs and hashes. The `metadata_indexer refresh` command keeps these +manifests cached under `ai/metadata/cache/`. Harvesting consumes the cached +copies to populate URLs and MD5 checksums. Planned enhancements (see `ai/notes.md` and `ai/bugs.json#metadata-harvest-no-source-urls`): 1. Abstract list fetching so BLFS/GLFS variants can reuse the logic. 2. Normalise the match criteria for package + version (handling pass stages, suffixes, etc.). -3. Populate checksum entries alongside URLs. 
## Manual review checklist diff --git a/src/bin/metadata_indexer.rs b/src/bin/metadata_indexer.rs index f058903..3ba638c 100644 --- a/src/bin/metadata_indexer.rs +++ b/src/bin/metadata_indexer.rs @@ -37,6 +37,15 @@ enum Command { #[arg(long)] compact: bool, }, + /// Refresh cached jhalfs manifests for the given book(s) + Refresh { + /// Books to refresh (defaults to all known books) + #[arg(long, value_delimiter = ',', default_value = "mlfs,lfs,blfs,glfs")] + books: Vec, + /// Force re-download even if cache files exist + #[arg(long)] + force: bool, + }, /// Fetch and draft metadata for a specific package page Harvest { /// Book identifier (lfs, mlfs, blfs, glfs) @@ -181,6 +190,37 @@ fn main() -> Result<()> { ); } } + Command::Refresh { books, force } => { + let unique: HashSet<_> = books.into_iter().map(|b| b.to_lowercase()).collect(); + let mut refreshed = 0usize; + for book in unique { + for kind in [ManifestKind::WgetList, ManifestKind::Md5Sums] { + match refresh_manifest(&metadata_dir, &book, kind, force) { + Ok(path) => { + refreshed += 1; + println!( + "Refreshed {} manifest for {} -> {}", + kind.description(), + book, + path.display() + ); + } + Err(err) => { + eprintln!( + "warning: failed to refresh {} manifest for {}: {}", + kind.description(), + book, + err + ); + } + } + } + } + + if refreshed == 0 { + println!("No manifests refreshed (check warnings above)."); + } + } } Ok(()) @@ -697,6 +737,7 @@ struct SourceUrlEntry { kind: &'static str, } +#[derive(Clone, Copy)] enum ManifestKind { WgetList, Md5Sums, @@ -709,6 +750,13 @@ impl ManifestKind { ManifestKind::Md5Sums => "md5sums.txt", } } + + fn description(&self) -> &'static str { + match self { + ManifestKind::WgetList => "wget-list", + ManifestKind::Md5Sums => "md5sums", + } + } } fn collect_tarball_urls(page_url: &str, document: &Html) -> Vec { @@ -824,14 +872,24 @@ fn resolve_checksums( } fn load_jhalfs_manifest(metadata_dir: &Path, book: &str, kind: ManifestKind) -> Result { + let cache_path = 
refresh_manifest(metadata_dir, book, kind, false)?; + fs::read_to_string(&cache_path) + .with_context(|| format!("reading cached manifest {}", cache_path.display())) +} + +fn refresh_manifest( + metadata_dir: &Path, + book: &str, + kind: ManifestKind, + force: bool, +) -> Result { let cache_dir = metadata_dir.join("cache"); fs::create_dir_all(&cache_dir) .with_context(|| format!("creating cache directory {}", cache_dir.display()))?; let cache_path = cache_dir.join(format!("{}-{}", book, kind.filename())); - if cache_path.exists() { - return fs::read_to_string(&cache_path) - .with_context(|| format!("reading cached manifest {}", cache_path.display())); + if cache_path.exists() && !force { + return Ok(cache_path); } let url = manifest_url(book, &kind) @@ -850,7 +908,7 @@ fn load_jhalfs_manifest(metadata_dir: &Path, book: &str, kind: ManifestKind) -> fs::write(&cache_path, &body) .with_context(|| format!("caching manifest {}", cache_path.display()))?; - Ok(body) + Ok(cache_path) } fn manifest_url(book: &str, kind: &ManifestKind) -> Option<&'static str> {