From c19c5c21ab994ad1c81e7cf63e3bbb96eb783d33 Mon Sep 17 00:00:00 2001
From: m00d
Date: Wed, 1 Oct 2025 07:26:20 +0200
Subject: [PATCH] Add metadata generator and CLI integration

---
 README.md                   |  11 ++
 ai/context.md               |   4 +-
 docs/METADATA_PIPELINE.md   |   1 +
 src/bin/metadata_indexer.rs |  39 ++++++
 src/pkgs/generator.rs       | 236 ++++++++++++++++++++++++++++++++++++
 src/pkgs/mod.rs             |   1 +
 6 files changed, 291 insertions(+), 1 deletion(-)
 create mode 100644 src/pkgs/generator.rs

diff --git a/README.md b/README.md
index 8eedfe7..745642a 100644
--- a/README.md
+++ b/README.md
@@ -129,6 +129,17 @@ cargo run --bin metadata_indexer -- --base-dir . refresh
 Passing `--books mlfs,blfs` restricts the refresh to specific books, and
 `--force` bypasses the local cache.
 
+To materialise a Rust module from harvested metadata:
+
+```bash
+cargo run --bin metadata_indexer -- \
+  --base-dir . generate \
+  --metadata ai/metadata/packages/mlfs/binutils-pass-1.json \
+  --output target/generated/by_name
+```
+
+Add `--overwrite` to regenerate an existing module directory.
+
 ## 📚 Documentation
 
 - [Architecture Overview](docs/ARCHITECTURE.md) – high-level tour of the crate
diff --git a/ai/context.md b/ai/context.md
index 59111b9..bf67ce1 100644
--- a/ai/context.md
+++ b/ai/context.md
@@ -5,7 +5,9 @@ first when revisiting the project.
 
 - `metadata_indexer` now supports a `refresh` command that pulls jhalfs
   `wget-list`/`md5sums` manifests into `ai/metadata/cache/` and the `harvest`
-  command automatically draws URLs and checksums from those manifests.
+  command automatically draws URLs and checksums from those manifests. A
+  `generate` subcommand consumes harvested metadata and scaffolds Rust modules
+  under `src/pkgs/by_name` (or a custom output directory).
 - AI state lives under `ai/`:
   - `ai/personas.json`, `ai/tasks.json`, `ai/bugs.json` track personas,
     outstanding work, and known issues.
diff --git a/docs/METADATA_PIPELINE.md b/docs/METADATA_PIPELINE.md
index 10ee366..dfc4197 100644
--- a/docs/METADATA_PIPELINE.md
+++ b/docs/METADATA_PIPELINE.md
@@ -21,6 +21,7 @@ This document explains the workflow and the supporting assets.
 | `index` | Re-runs validation and regenerates `index.json`. Use `--compact` to write a single-line JSON payload. |
 | `harvest` | Fetches a book page, scrapes build instructions, and emits a draft metadata record (to stdout with `--dry-run` or into `ai/metadata/packages/`). Falls back to jhalfs manifests when inline source links are absent. |
 | `refresh` | Updates cached jhalfs manifests (`wget-list`, `md5sums`) under `ai/metadata/cache/`. Supports `--books` filtering and `--force` to bypass the cache. |
+| `generate` | Translates harvested metadata into Rust modules under `src/pkgs/by_name` (or a specified directory), using the scaffolder to create `PackageDefinition` wrappers. |
 
 ### Harvesting flow
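The README snippet above drives the generator through the CLI; the same flow can also be called from Rust through the new `generator` API. A minimal sketch, assuming the `package_management` crate name taken from the indexer import below and the `anyhow` error handling the patch already uses:

```rust
use std::path::Path;

use anyhow::Result;
// Crate name taken from the `use package_management::pkgs::generator;` import below.
use package_management::pkgs::generator;

fn main() -> Result<()> {
    // Same inputs as the CLI example: a harvested metadata record and the by_name root.
    let metadata = Path::new("ai/metadata/packages/mlfs/binutils-pass-1.json");
    let output = Path::new("target/generated/by_name");

    // Directory the module will occupy (useful for an --overwrite-style cleanup).
    let dir = generator::module_directory(metadata, output)?;
    println!("module directory: {}", dir.display());

    // Scaffold the module and report the generated file.
    let module_path = generator::generate_module(metadata, output)?;
    println!("generated {}", module_path.display());
    Ok(())
}
```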
diff --git a/src/bin/metadata_indexer.rs b/src/bin/metadata_indexer.rs
index 3ba638c..13fe78c 100644
--- a/src/bin/metadata_indexer.rs
+++ b/src/bin/metadata_indexer.rs
@@ -13,6 +13,8 @@
 use serde_json::{Value, json};
 use sha2::{Digest, Sha256};
 use walkdir::WalkDir;
+use package_management::pkgs::generator;
+
 #[derive(Parser)]
 #[command(
     name = "metadata-indexer",
@@ -64,6 +66,18 @@
         #[arg(long)]
         dry_run: bool,
     },
+    /// Generate Rust modules from harvested metadata
+    Generate {
+        /// Path to the harvested metadata JSON file
+        #[arg(long)]
+        metadata: PathBuf,
+        /// Output directory (should be the `by_name` root)
+        #[arg(long, default_value = "src/pkgs/by_name")]
+        output: PathBuf,
+        /// Remove existing module directory before regeneration
+        #[arg(long)]
+        overwrite: bool,
+    },
 }
 
 fn main() -> Result<()> {
@@ -221,6 +235,31 @@
                 println!("No manifests refreshed (check warnings above).");
             }
         }
+        Command::Generate {
+            metadata,
+            output,
+            overwrite,
+        } => {
+            if overwrite {
+                match generator::module_directory(&metadata, &output) {
+                    Ok(dir) if dir.exists() => {
+                        fs::remove_dir_all(&dir).with_context(|| {
+                            format!("removing existing module {}", dir.display())
+                        })?;
+                    }
+                    Ok(_) => {}
+                    Err(err) => {
+                        eprintln!(
+                            "warning: could not determine existing module directory: {}",
+                            err
+                        );
+                    }
+                }
+            }
+
+            let module_path = generator::generate_module(&metadata, &output)?;
+            println!("Generated module at {}", module_path.display());
+        }
     }
 
     Ok(())
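The generator below places each module two levels deep: a slug derived from the package id, under a directory named after its first two characters. A standalone sketch of those rules, mirroring the private `module_override_from_id`, `sanitize_module_name`, and `prefix_from_module` helpers defined later in this patch (ignoring their short-name and leading-digit special cases; `expected_module_dir` is a hypothetical helper, not part of the patch):

```rust
/// Hypothetical helper mirroring the slug/prefix rules in `src/pkgs/generator.rs`.
fn expected_module_dir(package_id: &str, by_name_root: &str) -> String {
    // Drop the book prefix: "mlfs/binutils-pass-1" -> "binutils-pass-1".
    let slug = package_id
        .split_once('/')
        .map(|(_, s)| s)
        .unwrap_or(package_id);
    // Normalise to a module name: lowercase, everything non-alphanumeric becomes '_'.
    let module: String = slug
        .chars()
        .map(|c| {
            if c.is_ascii_alphanumeric() {
                c.to_ascii_lowercase()
            } else {
                '_'
            }
        })
        .collect();
    // Modules are grouped under a directory named after their first two characters.
    let prefix: String = module.chars().take(2).collect();
    format!("{by_name_root}/{prefix}/{module}")
}

fn main() {
    // The README example lands at target/generated/by_name/bi/binutils_pass_1.
    println!(
        "{}",
        expected_module_dir("mlfs/binutils-pass-1", "target/generated/by_name")
    );
}
```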
diff --git a/src/pkgs/generator.rs b/src/pkgs/generator.rs
new file mode 100644
index 0000000..a215b3c
--- /dev/null
+++ b/src/pkgs/generator.rs
@@ -0,0 +1,236 @@
+use std::collections::HashSet;
+use std::fs;
+use std::path::{Path, PathBuf};
+
+use anyhow::{Context, Result, anyhow};
+use serde::Deserialize;
+
+use crate::pkgs::scaffolder::{self, ScaffoldRequest};
+
+#[derive(Debug, Deserialize)]
+struct HarvestedPackage {
+    package: HarvestedMetadata,
+    source: HarvestedSource,
+    #[serde(default)]
+    build: Vec<CommandPhase>,
+    #[serde(default)]
+    dependencies: Option<HarvestedDependencies>,
+    optimizations: HarvestedOptimisations,
+}
+
+#[derive(Debug, Deserialize)]
+struct HarvestedMetadata {
+    id: String,
+    name: String,
+    version: String,
+    #[serde(default)]
+    stage: Option<String>,
+    #[serde(default)]
+    variant: Option<String>,
+    #[serde(default)]
+    notes: Option<String>,
+}
+
+#[derive(Debug, Deserialize)]
+struct HarvestedSource {
+    #[serde(default)]
+    archive: Option<String>,
+    #[serde(default)]
+    urls: Vec<HarvestedUrl>,
+    #[serde(default)]
+    checksums: Vec<HarvestedChecksum>,
+}
+
+#[derive(Debug, Deserialize)]
+struct HarvestedUrl {
+    url: String,
+}
+
+#[derive(Debug, Deserialize)]
+struct HarvestedChecksum {
+    alg: String,
+    value: String,
+}
+
+#[derive(Debug, Deserialize)]
+struct HarvestedOptimisations {
+    enable_lto: bool,
+    enable_pgo: bool,
+    #[serde(default)]
+    cflags: Vec<String>,
+    #[serde(default)]
+    ldflags: Vec<String>,
+    #[serde(default)]
+    profdata: Option<String>,
+}
+
+#[derive(Debug, Deserialize)]
+struct CommandPhase {
+    #[serde(default)]
+    phase: Option<String>,
+    #[serde(default)]
+    commands: Vec<String>,
+    #[serde(default)]
+    cwd: Option<String>,
+    #[serde(default)]
+    requires_root: Option<bool>,
+    #[serde(default)]
+    notes: Option<String>,
+}
+
+#[derive(Debug, Deserialize)]
+struct HarvestedDependencies {
+    #[serde(default)]
+    build: Vec<String>,
+    #[serde(default)]
+    runtime: Vec<String>,
+}
+
+/// Generate a Rust module from harvested metadata, returning the path to the generated file.
+pub fn generate_module(
+    metadata_path: impl AsRef<Path>,
+    base_dir: impl AsRef<Path>,
+) -> Result<PathBuf> {
+    let harvested = parse_metadata(metadata_path.as_ref())?;
+    let request = build_request(&harvested)?;
+    let result = scaffolder::scaffold_package(base_dir.as_ref(), request)?;
+    Ok(result.module_path)
+}
+
+/// Compute the directory for a module derived from the given metadata.
+pub fn module_directory(
+    metadata_path: impl AsRef<Path>,
+    base_dir: impl AsRef<Path>,
+) -> Result<PathBuf> {
+    let harvested = parse_metadata(metadata_path.as_ref())?;
+    let slug = module_override_from_id(&harvested.package.id).ok_or_else(|| {
+        anyhow!(
+            "unable to derive module slug from id '{}'",
+            harvested.package.id
+        )
+    })?;
+    let module = sanitize_module_name(&slug);
+    let dir = base_dir
+        .as_ref()
+        .join(prefix_from_module(&module))
+        .join(module);
+    Ok(dir)
+}
+
+fn build_request(pkg: &HarvestedPackage) -> Result<ScaffoldRequest> {
+    let slug = module_override_from_id(&pkg.package.id)
+        .ok_or_else(|| anyhow!("unable to derive module slug from id '{}'", pkg.package.id))?;
+
+    let mut build_commands = Vec::new();
+    let mut install_commands = Vec::new();
+    for command in flatten_commands(&pkg.build) {
+        if command.contains("make install") {
+            install_commands.push(command);
+        } else {
+            build_commands.push(command);
+        }
+    }
+
+    let mut dependencies = HashSet::new();
+    if let Some(deps) = &pkg.dependencies {
+        for dep in &deps.build {
+            dependencies.insert(dep.clone());
+        }
+        for dep in &deps.runtime {
+            dependencies.insert(dep.clone());
+        }
+    }
+    let mut dependencies: Vec<String> = dependencies.into_iter().collect();
+    dependencies.sort();
+
+    let request = ScaffoldRequest {
+        name: pkg.package.name.clone(),
+        version: pkg.package.version.clone(),
+        source: pkg.source.urls.first().map(|u| u.url.clone()),
+        md5: pkg
+            .source
+            .checksums
+            .iter()
+            .find(|c| c.alg.eq_ignore_ascii_case("md5"))
+            .map(|c| c.value.clone()),
+        configure_args: Vec::new(),
+        build_commands,
+        install_commands,
+        dependencies,
+        enable_lto: pkg.optimizations.enable_lto,
+        enable_pgo: pkg.optimizations.enable_pgo,
+        cflags: pkg.optimizations.cflags.clone(),
+        ldflags: pkg.optimizations.ldflags.clone(),
+        profdata: pkg.optimizations.profdata.clone(),
+        stage: pkg.package.stage.clone(),
+        variant: pkg.package.variant.clone(),
+        notes: pkg.package.notes.clone(),
+        module_override: Some(slug),
+    };
+
+    Ok(request)
+}
+
+fn flatten_commands(phases: &[CommandPhase]) -> Vec<String> {
+    phases
+        .iter()
+        .flat_map(|phase| phase.commands.iter().cloned())
+        .collect()
+}
+
+fn module_override_from_id(id: &str) -> Option<String> {
+    let slug = match id.split_once('/') {
+        Some((_, slug)) => slug,
+        None => id,
+    };
+    Some(
+        slug.replace('.', "_")
+            .replace('/', "_")
+            .replace('-', "_")
+            .replace(' ', "_")
+            .to_lowercase(),
+    )
+}
+
+fn parse_metadata(path: &Path) -> Result<HarvestedPackage> {
+    let metadata = fs::read_to_string(path)
+        .with_context(|| format!("reading metadata file {}", path.display()))?;
+    let harvested: HarvestedPackage = serde_json::from_str(&metadata)
+        .with_context(|| format!("parsing harvested metadata from {}", path.display()))?;
+    Ok(harvested)
+}
+
+fn sanitize_module_name(name: &str) -> String {
+    let mut out = String::new();
+    for ch in name.chars() {
+        if ch.is_ascii_alphanumeric() {
+            out.push(ch.to_ascii_lowercase());
+        } else if ch == '_' || ch == '+' || ch == '-' {
+            out.push('_');
+        } else {
+            out.push('_');
+        }
+    }
+    if out.is_empty() {
+        out.push_str("pkg");
+    }
+    if out
+        .chars()
+        .next()
+        .map(|c| c.is_ascii_digit())
+        .unwrap_or(false)
+    {
+        out.insert(0, 'p');
+    }
+    out
+}
+
+fn prefix_from_module(module: &str) -> String {
+    let mut chars = module.chars();
+    let first = chars.next().unwrap_or('p');
+    let second = chars.next().unwrap_or('k');
+    let mut s = String::new();
+    s.push(first);
+    s.push(second);
+    s
+}
diff --git a/src/pkgs/mod.rs b/src/pkgs/mod.rs
index 90957bc..d7c40a7 100644
--- a/src/pkgs/mod.rs
+++ b/src/pkgs/mod.rs
@@ -1,4 +1,5 @@
 pub mod by_name;
+pub mod generator;
 pub mod mlfs;
 pub mod package;
 pub mod scaffolder;
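The new module ships without tests. If coverage is wanted, a unit-test sketch along these lines could be appended to `src/pkgs/generator.rs`; it is not part of the patch above, and it assumes the `ScaffoldRequest` fields are readable exactly as `build_request` populates them and that `serde_json` is a dependency, as `parse_metadata` implies. All fixture values are illustrative.

```rust
#[cfg(test)]
mod tests {
    use super::*;

    // Minimal harvested record exercising the serde defaults; values are illustrative.
    const SAMPLE: &str = r#"{
        "package": { "id": "mlfs/binutils-pass-1", "name": "binutils", "version": "2.43" },
        "source": {
            "urls": [{ "url": "https://example.org/binutils-2.43.tar.xz" }],
            "checksums": [{ "alg": "md5", "value": "00000000000000000000000000000000" }]
        },
        "build": [{ "commands": ["./configure --prefix=/usr", "make", "make install"] }],
        "optimizations": { "enable_lto": false, "enable_pgo": false }
    }"#;

    #[test]
    fn build_request_splits_install_commands_and_derives_slug() {
        let pkg: HarvestedPackage = serde_json::from_str(SAMPLE).expect("sample parses");
        let request = build_request(&pkg).expect("request builds");
        // Commands containing "make install" are routed to the install phase.
        assert_eq!(request.build_commands, vec!["./configure --prefix=/usr", "make"]);
        assert_eq!(request.install_commands, vec!["make install"]);
        // The module slug drops the book prefix and normalises separators.
        assert_eq!(request.module_override.as_deref(), Some("binutils_pass_1"));
        // The md5 checksum is picked out of the checksum list case-insensitively.
        assert_eq!(request.md5.as_deref(), Some("00000000000000000000000000000000"));
    }
}
```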