Add metadata generator and CLI integration

parent 205ab25d41
commit c19c5c21ab

6 changed files with 291 additions and 1 deletion
README.md (+11)

@@ -129,6 +129,17 @@ cargo run --bin metadata_indexer -- --base-dir . refresh
 Passing `--books mlfs,blfs` restricts the refresh to specific books, and
 `--force` bypasses the local cache.
 
+To materialise a Rust module from harvested metadata:
+
+```bash
+cargo run --bin metadata_indexer -- \
+    --base-dir . generate \
+    --metadata ai/metadata/packages/mlfs/binutils-pass-1.json \
+    --output target/generated/by_name
+```
+
+Add `--overwrite` to regenerate an existing module directory.
+
 ## 📚 Documentation
 
 - [Architecture Overview](docs/ARCHITECTURE.md) – high-level tour of the crate
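For orientation, the file passed to `--metadata` is a JSON record emitted by `harvest`; it deserialises into the `HarvestedPackage` structs added in `src/pkgs/generator.rs` below. A minimal sketch of such a record, written with `serde_json`'s `json!` macro so it stays in Rust (all values illustrative, not taken from the commit):

```rust
use serde_json::json;

fn main() {
    // Only `package` (id/name/version), `source`, and `optimizations`
    // (enable_lto/enable_pgo) are mandatory; every other field is
    // #[serde(default)] in the structs below.
    let record = json!({
        "package": { "id": "mlfs/binutils-pass-1", "name": "binutils", "version": "X.Y" },
        "source": {
            "urls": [{ "url": "https://example.org/binutils-X.Y.tar.xz" }],
            "checksums": [{ "alg": "md5", "value": "<hex digest>" }]
        },
        "build": [{ "phase": "build", "commands": ["./configure", "make", "make install"] }],
        "optimizations": { "enable_lto": false, "enable_pgo": false }
    });
    println!("{}", serde_json::to_string_pretty(&record).unwrap());
}
```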
@@ -5,7 +5,9 @@
 first when revisiting the project.
 - `metadata_indexer` now supports a `refresh` command that pulls jhalfs
   `wget-list`/`md5sums` manifests into `ai/metadata/cache/` and the `harvest`
-  command automatically draws URLs and checksums from those manifests.
+  command automatically draws URLs and checksums from those manifests. A
+  `generate` subcommand consumes harvested metadata and scaffolds Rust modules
+  under `src/pkgs/by_name` (or a custom output directory).
 - AI state lives under `ai/`:
   - `ai/personas.json`, `ai/tasks.json`, `ai/bugs.json` track personas,
     outstanding work, and known issues.
@@ -21,6 +21,7 @@ This document explains the workflow and the supporting assets.
 | `index` | Re-runs validation and regenerates `index.json`. Use `--compact` to write a single-line JSON payload. |
 | `harvest` | Fetches a book page, scrapes build instructions, and emits a draft metadata record (to stdout with `--dry-run` or into `ai/metadata/packages/`). Falls back to jhalfs manifests when inline source links are absent. |
 | `refresh` | Updates cached jhalfs manifests (`wget-list`, `md5sums`) under `ai/metadata/cache/`. Supports `--books` filtering and `--force` to bypass the cache. |
+| `generate` | Translates harvested metadata into Rust modules under `src/pkgs/by_name` (or a specified directory), using the scaffolder to create `PackageDefinition` wrappers. |
 
 ### Harvesting flow
 
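Read together, the table describes a pipeline: `refresh` caches the jhalfs manifests, `harvest` turns a book page plus those manifests into a metadata record, and the new `generate` step (exercised in the README example above) materialises that record as a module.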
src/bin/metadata_indexer.rs

@@ -13,6 +13,8 @@ use serde_json::{Value, json};
 use sha2::{Digest, Sha256};
 use walkdir::WalkDir;
 
+use package_management::pkgs::generator;
+
 #[derive(Parser)]
 #[command(
     name = "metadata-indexer",
@@ -64,6 +66,18 @@ enum Command {
         #[arg(long)]
         dry_run: bool,
     },
+    /// Generate Rust modules from harvested metadata
+    Generate {
+        /// Path to the harvested metadata JSON file
+        #[arg(long)]
+        metadata: PathBuf,
+        /// Output directory (should be the `by_name` root)
+        #[arg(long, default_value = "src/pkgs/by_name")]
+        output: PathBuf,
+        /// Remove existing module directory before regeneration
+        #[arg(long)]
+        overwrite: bool,
+    },
 }
 
 fn main() -> Result<()> {
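With clap's derive API, this variant surfaces as `generate --metadata <METADATA> [--output <OUTPUT>] [--overwrite]`; the single-word field names map to flags unchanged, and `--output` falls back to its declared default of `src/pkgs/by_name`.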
@@ -221,6 +235,31 @@ fn main() -> Result<()> {
                 println!("No manifests refreshed (check warnings above).");
             }
         }
+        Command::Generate {
+            metadata,
+            output,
+            overwrite,
+        } => {
+            if overwrite {
+                match generator::module_directory(&metadata, &output) {
+                    Ok(dir) if dir.exists() => {
+                        fs::remove_dir_all(&dir).with_context(|| {
+                            format!("removing existing module {}", dir.display())
+                        })?;
+                    }
+                    Ok(_) => {}
+                    Err(err) => {
+                        eprintln!(
+                            "warning: could not determine existing module directory: {}",
+                            err
+                        );
+                    }
+                }
+            }
+
+            let module_path = generator::generate_module(&metadata, &output)?;
+            println!("Generated module at {}", module_path.display());
+        }
     }
 
     Ok(())
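One detail worth noting in the `Generate` arm: `--overwrite` only removes the directory that `generator::module_directory` derives from the metadata itself, and a failed derivation degrades to a warning, so the flag cannot delete anything outside the computed module path.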
src/pkgs/generator.rs (new file, 236 lines)

```rust
use std::collections::HashSet;
use std::fs;
use std::path::{Path, PathBuf};

use anyhow::{Context, Result, anyhow};
use serde::Deserialize;

use crate::pkgs::scaffolder::{self, ScaffoldRequest};

#[derive(Debug, Deserialize)]
struct HarvestedPackage {
    package: HarvestedMetadata,
    source: HarvestedSource,
    #[serde(default)]
    build: Vec<CommandPhase>,
    #[serde(default)]
    dependencies: Option<HarvestedDependencies>,
    optimizations: HarvestedOptimisations,
}

#[derive(Debug, Deserialize)]
struct HarvestedMetadata {
    id: String,
    name: String,
    version: String,
    #[serde(default)]
    stage: Option<String>,
    #[serde(default)]
    variant: Option<String>,
    #[serde(default)]
    notes: Option<String>,
}

#[derive(Debug, Deserialize)]
struct HarvestedSource {
    #[serde(default)]
    archive: Option<String>,
    #[serde(default)]
    urls: Vec<HarvestedUrl>,
    #[serde(default)]
    checksums: Vec<HarvestedChecksum>,
}

#[derive(Debug, Deserialize)]
struct HarvestedUrl {
    url: String,
}

#[derive(Debug, Deserialize)]
struct HarvestedChecksum {
    alg: String,
    value: String,
}

#[derive(Debug, Deserialize)]
struct HarvestedOptimisations {
    enable_lto: bool,
    enable_pgo: bool,
    #[serde(default)]
    cflags: Vec<String>,
    #[serde(default)]
    ldflags: Vec<String>,
    #[serde(default)]
    profdata: Option<String>,
}

#[derive(Debug, Deserialize)]
struct CommandPhase {
    #[serde(default)]
    phase: Option<String>,
    #[serde(default)]
    commands: Vec<String>,
    #[serde(default)]
    cwd: Option<String>,
    #[serde(default)]
    requires_root: Option<bool>,
    #[serde(default)]
    notes: Option<String>,
}

#[derive(Debug, Deserialize)]
struct HarvestedDependencies {
    #[serde(default)]
    build: Vec<String>,
    #[serde(default)]
    runtime: Vec<String>,
}

/// Generate a Rust module from harvested metadata, returning the path to the generated file.
pub fn generate_module(
    metadata_path: impl AsRef<Path>,
    base_dir: impl AsRef<Path>,
) -> Result<PathBuf> {
    let harvested = parse_metadata(metadata_path.as_ref())?;
    let request = build_request(&harvested)?;
    let result = scaffolder::scaffold_package(base_dir.as_ref(), request)?;
    Ok(result.module_path)
}

/// Compute the directory for a module derived from the given metadata.
pub fn module_directory(
    metadata_path: impl AsRef<Path>,
    base_dir: impl AsRef<Path>,
) -> Result<PathBuf> {
    let harvested = parse_metadata(metadata_path.as_ref())?;
    let slug = module_override_from_id(&harvested.package.id).ok_or_else(|| {
        anyhow!(
            "unable to derive module slug from id '{}'",
            harvested.package.id
        )
    })?;
    let module = sanitize_module_name(&slug);
    let dir = base_dir
        .as_ref()
        .join(prefix_from_module(&module))
        .join(module);
    Ok(dir)
}

// Translate a harvested record into the scaffolder's request shape.
fn build_request(pkg: &HarvestedPackage) -> Result<ScaffoldRequest> {
    let slug = module_override_from_id(&pkg.package.id)
        .ok_or_else(|| anyhow!("unable to derive module slug from id '{}'", pkg.package.id))?;

    // Any command mentioning "make install" is treated as an install step.
    let mut build_commands = Vec::new();
    let mut install_commands = Vec::new();
    for command in flatten_commands(&pkg.build) {
        if command.contains("make install") {
            install_commands.push(command);
        } else {
            build_commands.push(command);
        }
    }

    // Merge build and runtime dependencies, deduplicated and sorted.
    let mut dependencies = HashSet::new();
    if let Some(deps) = &pkg.dependencies {
        for dep in &deps.build {
            dependencies.insert(dep.clone());
        }
        for dep in &deps.runtime {
            dependencies.insert(dep.clone());
        }
    }
    let mut dependencies: Vec<String> = dependencies.into_iter().collect();
    dependencies.sort();

    let request = ScaffoldRequest {
        name: pkg.package.name.clone(),
        version: pkg.package.version.clone(),
        source: pkg.source.urls.first().map(|u| u.url.clone()),
        md5: pkg
            .source
            .checksums
            .iter()
            .find(|c| c.alg.eq_ignore_ascii_case("md5"))
            .map(|c| c.value.clone()),
        configure_args: Vec::new(),
        build_commands,
        install_commands,
        dependencies,
        enable_lto: pkg.optimizations.enable_lto,
        enable_pgo: pkg.optimizations.enable_pgo,
        cflags: pkg.optimizations.cflags.clone(),
        ldflags: pkg.optimizations.ldflags.clone(),
        profdata: pkg.optimizations.profdata.clone(),
        stage: pkg.package.stage.clone(),
        variant: pkg.package.variant.clone(),
        notes: pkg.package.notes.clone(),
        module_override: Some(slug),
    };

    Ok(request)
}

// Concatenate the command lists of every phase, in order.
fn flatten_commands(phases: &[CommandPhase]) -> Vec<String> {
    phases
        .iter()
        .flat_map(|phase| phase.commands.iter().cloned())
        .collect()
}

// Strip the book prefix from an id like "mlfs/binutils-pass-1" and
// normalise the remainder into a lowercase snake_case slug.
fn module_override_from_id(id: &str) -> Option<String> {
    let slug = match id.split_once('/') {
        Some((_, slug)) => slug,
        None => id,
    };
    Some(
        slug.replace('.', "_")
            .replace('/', "_")
            .replace('-', "_")
            .replace(' ', "_")
            .to_lowercase(),
    )
}

fn parse_metadata(path: &Path) -> Result<HarvestedPackage> {
    let metadata = fs::read_to_string(path)
        .with_context(|| format!("reading metadata file {}", path.display()))?;
    let harvested: HarvestedPackage = serde_json::from_str(&metadata)
        .with_context(|| format!("parsing harvested metadata from {}", path.display()))?;
    Ok(harvested)
}

// Force the slug into a valid Rust module identifier: every
// non-alphanumeric character collapses to '_', and a leading digit
// gains a 'p' prefix.
fn sanitize_module_name(name: &str) -> String {
    let mut out = String::new();
    for ch in name.chars() {
        if ch.is_ascii_alphanumeric() {
            out.push(ch.to_ascii_lowercase());
        } else if ch == '_' || ch == '+' || ch == '-' {
            out.push('_');
        } else {
            out.push('_');
        }
    }
    if out.is_empty() {
        out.push_str("pkg");
    }
    if out
        .chars()
        .next()
        .map(|c| c.is_ascii_digit())
        .unwrap_or(false)
    {
        out.insert(0, 'p');
    }
    out
}

// Two-character shard directory taken from the module name.
fn prefix_from_module(module: &str) -> String {
    let mut chars = module.chars();
    let first = chars.next().unwrap_or('p');
    let second = chars.next().unwrap_or('k');
    let mut s = String::new();
    s.push(first);
    s.push(second);
    s
}
```
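A quick sketch of how the naming helpers compose at a call site, assuming (as in the CLI changes above) the crate is importable as `package_management` and the metadata file from the README example exists:

```rust
use package_management::pkgs::generator;

fn main() -> anyhow::Result<()> {
    // id "mlfs/binutils-pass-1" -> slug "binutils_pass_1" -> prefix "bi",
    // so the directory resolves to src/pkgs/by_name/bi/binutils_pass_1.
    let dir = generator::module_directory(
        "ai/metadata/packages/mlfs/binutils-pass-1.json",
        "src/pkgs/by_name",
    )?;
    println!("{}", dir.display());
    Ok(())
}
```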
src/pkgs/mod.rs

@@ -1,4 +1,5 @@
 pub mod by_name;
+pub mod generator;
 pub mod mlfs;
 pub mod package;
 pub mod scaffolder;