Add metadata refresh command and jhalfs caching
This commit is contained in:
parent
3ce470e019
commit
12e6d41e58
5 changed files with 92 additions and 19 deletions
|
|
@ -120,6 +120,15 @@ cargo run --bin metadata_indexer -- \
|
||||||
--dry-run
|
--dry-run
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Keep the jhalfs manifests current with:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cargo run --bin metadata_indexer -- --base-dir . refresh
|
||||||
|
```
|
||||||
|
|
||||||
|
Passing `--books mlfs,blfs` restricts the refresh to specific books, and
|
||||||
|
`--force` re-downloads the manifests even when cached copies already exist.
|
||||||
|
|
||||||
## 📚 Documentation
|
## 📚 Documentation
|
||||||
|
|
||||||
- [Architecture Overview](docs/ARCHITECTURE.md) – high-level tour of the crate
|
- [Architecture Overview](docs/ARCHITECTURE.md) – high-level tour of the crate
|
||||||
|
|
|
||||||
|
|
@ -51,6 +51,13 @@
|
||||||
"description": "Provide a standalone CLI to validate package metadata against the schema and regenerate ai/metadata/index.json.",
|
"description": "Provide a standalone CLI to validate package metadata against the schema and regenerate ai/metadata/index.json.",
|
||||||
"resolution": "Added src/bin/metadata_indexer.rs with schema validation, summary extraction, and index writer integration.",
|
"resolution": "Added src/bin/metadata_indexer.rs with schema validation, summary extraction, and index writer integration.",
|
||||||
"owner": "default_cli"
|
"owner": "default_cli"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "metadata-jhalfs-refresh",
|
||||||
|
"title": "Wire jhalfs manifests into metadata harvester",
|
||||||
|
"description": "Cache wget-list/md5sums from jhalfs and expose a CLI refresh command so harvesting can populate source URLs and checksums reliably.",
|
||||||
|
"resolution": "Extended metadata_indexer with a `refresh` subcommand, cached manifests under ai/metadata/cache/, and hooked harvest to populate MD5 checksums via jhalfs data.",
|
||||||
|
"owner": "default_cli"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -42,8 +42,11 @@ artifacts under `ai/metadata/`:
|
||||||
`ai/metadata/index.json` (use `--compact` for single-line JSON).
|
`ai/metadata/index.json` (use `--compact` for single-line JSON).
|
||||||
- `harvest` – fetches a given book page, extracts build metadata, and emits a
|
- `harvest` – fetches a given book page, extracts build metadata, and emits a
|
||||||
schema-compliant JSON skeleton. When direct HTML parsing does not locate the
|
schema-compliant JSON skeleton. When direct HTML parsing does not locate the
|
||||||
source tarball, it falls back to the jhalfs `wget-list` data to populate
|
source tarball, it falls back to cached jhalfs manifests to populate
|
||||||
`source.urls`.
|
`source.urls` and MD5 checksums.
|
||||||
|
- `refresh` – downloads (or re-downloads with `--force`) the jhalfs manifests
|
||||||
|
(`wget-list`, `md5sums`) for one or more books and stores them under
|
||||||
|
`ai/metadata/cache/`.
|
||||||
|
|
||||||
## Module layout
|
## Module layout
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,8 @@ This document explains the workflow and the supporting assets.
|
||||||
| ------- | ----------- |
|
| ------- | ----------- |
|
||||||
| `validate` | Loads every package JSON file and validates it against `schema.json`. Reports schema violations and summary extraction errors. |
|
| `validate` | Loads every package JSON file and validates it against `schema.json`. Reports schema violations and summary extraction errors. |
|
||||||
| `index` | Re-runs validation and regenerates `index.json`. Use `--compact` to write a single-line JSON payload. |
|
| `index` | Re-runs validation and regenerates `index.json`. Use `--compact` to write a single-line JSON payload. |
|
||||||
| `harvest` | Fetches a book page, scrapes build instructions, and emits a draft metadata record (to stdout with `--dry-run` or into `ai/metadata/packages/`). |
|
| `harvest` | Fetches a book page, scrapes build instructions, and emits a draft metadata record (to stdout with `--dry-run` or into `ai/metadata/packages/`). Falls back to jhalfs manifests when inline source links are absent. |
|
||||||
|
| `refresh` | Updates cached jhalfs manifests (`wget-list`, `md5sums`) under `ai/metadata/cache/`. Supports `--books` filtering and `--force` to re-download manifests even when cached copies exist. |
|
||||||
|
|
||||||
### Harvesting flow
|
### Harvesting flow
|
||||||
|
|
||||||
|
|
@ -32,11 +33,10 @@ This document explains the workflow and the supporting assets.
|
||||||
4. **Artifact stats** – `div.segmentedlist` entries supply SBU and disk usage.
|
4. **Artifact stats** – `div.segmentedlist` entries supply SBU and disk usage.
|
||||||
5. **Source URLs** – the harvester tries two strategies:
|
5. **Source URLs** – the harvester tries two strategies:
|
||||||
- Inline HTML links inside the page (common for BLFS articles).
|
- Inline HTML links inside the page (common for BLFS articles).
|
||||||
- Fallback to the jhalfs `wget-list` for the selected book (currently MLFS)
|
- Fallback to the cached jhalfs `wget-list` for the selected book to find
|
||||||
using `package-management::wget_list::get_wget_list` to find matching
|
matching `<package>-<version>` entries.
|
||||||
`<package>-<version>` entries.
|
6. **Checksums** – the matching entry from the cached jhalfs `md5sums`
|
||||||
6. **Checksums** – integration with the book’s `md5sums` mirror is pending;
|
manifest populates `source.checksums` when the archive name is known.
|
||||||
placeholder wiring exists (`src/md5_utils.rs`).
|
|
||||||
7. **Status** – unresolved items (missing URLs, anchors, etc.) are recorded in
|
7. **Status** – unresolved items (missing URLs, anchors, etc.) are recorded in
|
||||||
`status.issues` so humans can interrogate or patch the draft before
|
`status.issues` so humans can interrogate or patch the draft before
|
||||||
promoting it.
|
promoting it.
|
||||||
|
|
@ -46,8 +46,6 @@ This document explains the workflow and the supporting assets.
|
||||||
- **Source links via tables** – some MLFS chapters list download links inside a
|
- **Source links via tables** – some MLFS chapters list download links inside a
|
||||||
“Package Information” table. The current implementation relies on the
|
“Package Information” table. The current implementation relies on the
|
||||||
jhalfs `wget-list` fallback instead of parsing that table.
|
jhalfs `wget-list` fallback instead of parsing that table.
|
||||||
- **Checksums** – MD5 lookups from jhalfs are planned but not yet wired into
|
|
||||||
the harvest pipeline.
|
|
||||||
- **Anchor discovery** – if the heading lacks an explicit `id` attribute, the
|
- **Anchor discovery** – if the heading lacks an explicit `id` attribute, the
|
||||||
scraper attempts to locate child anchors or scan the raw HTML. If none are
|
scraper attempts to locate child anchors or scan the raw HTML. If none are
|
||||||
found, a warning is recorded and `status.issues` contains a reminder.
|
found, a warning is recorded and `status.issues` contains a reminder.
|
||||||
|
|
@ -55,17 +53,15 @@ This document explains the workflow and the supporting assets.
|
||||||
## Using jhalfs manifests
|
## Using jhalfs manifests
|
||||||
|
|
||||||
The maintained `wget-list`/`md5sums` files hosted by jhalfs provide canonical
|
The maintained `wget-list`/`md5sums` files hosted by jhalfs provide canonical
|
||||||
source URLs and hashes. The helper modules `src/wget_list.rs` and
|
source URLs and hashes. The `metadata_indexer refresh` command keeps these
|
||||||
`src/md5_utils.rs` download these lists for the multilib LFS book. The
|
manifests cached under `ai/metadata/cache/`. Harvesting consumes the cached
|
||||||
harvester currently consumes the wget-list as a fallback; integrating the
|
copies to populate URLs and MD5 checksums.
|
||||||
`md5sums` file will let us emit `source.checksums` automatically.
|
|
||||||
|
|
||||||
Planned enhancements (see `ai/notes.md` and `ai/bugs.json#metadata-harvest-no-source-urls`):
|
Planned enhancements (see `ai/notes.md` and `ai/bugs.json#metadata-harvest-no-source-urls`):
|
||||||
|
|
||||||
1. Abstract list fetching so BLFS/GLFS variants can reuse the logic.
|
1. Abstract list fetching so BLFS/GLFS variants can reuse the logic.
|
||||||
2. Normalise the match criteria for package + version (handling pass stages,
|
2. Normalise the match criteria for package + version (handling pass stages,
|
||||||
suffixes, etc.).
|
suffixes, etc.).
|
||||||
3. Populate checksum entries alongside URLs.
|
|
||||||
|
|
||||||
## Manual review checklist
|
## Manual review checklist
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,15 @@ enum Command {
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
compact: bool,
|
compact: bool,
|
||||||
},
|
},
|
||||||
|
/// Refresh cached jhalfs manifests for the given book(s)
|
||||||
|
Refresh {
|
||||||
|
/// Books to refresh (defaults to all known books)
|
||||||
|
#[arg(long, value_delimiter = ',', default_value = "mlfs,lfs,blfs,glfs")]
|
||||||
|
books: Vec<String>,
|
||||||
|
/// Force re-download even if cache files exist
|
||||||
|
#[arg(long)]
|
||||||
|
force: bool,
|
||||||
|
},
|
||||||
/// Fetch and draft metadata for a specific package page
|
/// Fetch and draft metadata for a specific package page
|
||||||
Harvest {
|
Harvest {
|
||||||
/// Book identifier (lfs, mlfs, blfs, glfs)
|
/// Book identifier (lfs, mlfs, blfs, glfs)
|
||||||
|
|
@ -181,6 +190,37 @@ fn main() -> Result<()> {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Command::Refresh { books, force } => {
|
||||||
|
let unique: HashSet<_> = books.into_iter().map(|b| b.to_lowercase()).collect();
|
||||||
|
let mut refreshed = 0usize;
|
||||||
|
for book in unique {
|
||||||
|
for kind in [ManifestKind::WgetList, ManifestKind::Md5Sums] {
|
||||||
|
match refresh_manifest(&metadata_dir, &book, kind, force) {
|
||||||
|
Ok(path) => {
|
||||||
|
refreshed += 1;
|
||||||
|
println!(
|
||||||
|
"Refreshed {} manifest for {} -> {}",
|
||||||
|
kind.description(),
|
||||||
|
book,
|
||||||
|
path.display()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
eprintln!(
|
||||||
|
"warning: failed to refresh {} manifest for {}: {}",
|
||||||
|
kind.description(),
|
||||||
|
book,
|
||||||
|
err
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if refreshed == 0 {
|
||||||
|
println!("No manifests refreshed (check warnings above).");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|
@ -697,6 +737,7 @@ struct SourceUrlEntry {
|
||||||
kind: &'static str,
|
kind: &'static str,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
enum ManifestKind {
|
enum ManifestKind {
|
||||||
WgetList,
|
WgetList,
|
||||||
Md5Sums,
|
Md5Sums,
|
||||||
|
|
@ -709,6 +750,13 @@ impl ManifestKind {
|
||||||
ManifestKind::Md5Sums => "md5sums.txt",
|
ManifestKind::Md5Sums => "md5sums.txt",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn description(&self) -> &'static str {
|
||||||
|
match self {
|
||||||
|
ManifestKind::WgetList => "wget-list",
|
||||||
|
ManifestKind::Md5Sums => "md5sums",
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn collect_tarball_urls(page_url: &str, document: &Html) -> Vec<SourceUrlEntry> {
|
fn collect_tarball_urls(page_url: &str, document: &Html) -> Vec<SourceUrlEntry> {
|
||||||
|
|
@ -824,14 +872,24 @@ fn resolve_checksums(
|
||||||
}
|
}
|
||||||
|
|
||||||
fn load_jhalfs_manifest(metadata_dir: &Path, book: &str, kind: ManifestKind) -> Result<String> {
|
fn load_jhalfs_manifest(metadata_dir: &Path, book: &str, kind: ManifestKind) -> Result<String> {
|
||||||
|
let cache_path = refresh_manifest(metadata_dir, book, kind, false)?;
|
||||||
|
fs::read_to_string(&cache_path)
|
||||||
|
.with_context(|| format!("reading cached manifest {}", cache_path.display()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn refresh_manifest(
|
||||||
|
metadata_dir: &Path,
|
||||||
|
book: &str,
|
||||||
|
kind: ManifestKind,
|
||||||
|
force: bool,
|
||||||
|
) -> Result<PathBuf> {
|
||||||
let cache_dir = metadata_dir.join("cache");
|
let cache_dir = metadata_dir.join("cache");
|
||||||
fs::create_dir_all(&cache_dir)
|
fs::create_dir_all(&cache_dir)
|
||||||
.with_context(|| format!("creating cache directory {}", cache_dir.display()))?;
|
.with_context(|| format!("creating cache directory {}", cache_dir.display()))?;
|
||||||
|
|
||||||
let cache_path = cache_dir.join(format!("{}-{}", book, kind.filename()));
|
let cache_path = cache_dir.join(format!("{}-{}", book, kind.filename()));
|
||||||
if cache_path.exists() {
|
if cache_path.exists() && !force {
|
||||||
return fs::read_to_string(&cache_path)
|
return Ok(cache_path);
|
||||||
.with_context(|| format!("reading cached manifest {}", cache_path.display()));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let url = manifest_url(book, &kind)
|
let url = manifest_url(book, &kind)
|
||||||
|
|
@ -850,7 +908,7 @@ fn load_jhalfs_manifest(metadata_dir: &Path, book: &str, kind: ManifestKind) ->
|
||||||
fs::write(&cache_path, &body)
|
fs::write(&cache_path, &body)
|
||||||
.with_context(|| format!("caching manifest {}", cache_path.display()))?;
|
.with_context(|| format!("caching manifest {}", cache_path.display()))?;
|
||||||
|
|
||||||
Ok(body)
|
Ok(cache_path)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn manifest_url(book: &str, kind: &ManifestKind) -> Option<&'static str> {
|
fn manifest_url(book: &str, kind: &ManifestKind) -> Option<&'static str> {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue