use std::collections::HashSet;
use std::fs;
use std::path::{Path, PathBuf};

use anyhow::{Context, Result};
use chrono::Utc;
use clap::{Parser, Subcommand};
use jsonschema::JSONSchema;
use regex::Regex;
use reqwest::{blocking::Client, redirect::Policy};
use scraper::{ElementRef, Html, Selector};
use serde_json::{Value, json};
use sha2::{Digest, Sha256};
use walkdir::WalkDir;

use package_management::pkgs::generator;

#[derive(Parser)]
#[command(
    name = "metadata-indexer",
    about = "Validate and regenerate AI metadata index"
)]
struct Cli {
    /// Repository root containing the `ai/metadata` directory
    #[arg(long, default_value = ".")]
    base_dir: PathBuf,

    #[command(subcommand)]
    command: Command,
}

#[derive(Subcommand)]
enum Command {
    /// Validate all package metadata against the JSON schema
    Validate,
    /// Validate metadata and regenerate ai/metadata/index.json
    Index {
        /// Emit compact JSON instead of pretty printing
        #[arg(long)]
        compact: bool,
    },
    /// Refresh cached jhalfs manifests for the given book(s)
    Refresh {
        /// Books to refresh (defaults to all known books)
        #[arg(long, value_delimiter = ',', default_value = "mlfs,lfs,blfs,glfs")]
        books: Vec<String>,
        /// Force re-download even if cache files exist
        #[arg(long)]
        force: bool,
    },
    /// Fetch and draft metadata for a specific package page
    Harvest {
        /// Book identifier (lfs, mlfs, blfs, glfs)
        #[arg(long)]
        book: String,
        /// Page path (relative to base) or full URL
        #[arg(long)]
        page: String,
        /// Override base URL for the selected book
        #[arg(long)]
        base_url: Option<String>,
        /// Optional explicit output file path
        #[arg(long)]
        output: Option<PathBuf>,
        /// Do not write to disk, just print JSON to stdout
        #[arg(long)]
        dry_run: bool,
    },
    /// Generate Rust modules from harvested metadata
    Generate {
        /// Path to the harvested metadata JSON file
        #[arg(long)]
        metadata: PathBuf,
        /// Output directory (should be the `by_name` root)
        #[arg(long, default_value = "src/pkgs/by_name")]
        output: PathBuf,
        /// Remove existing module directory before regeneration
        #[arg(long)]
        overwrite: bool,
    },
}
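// Illustrative invocations (book names and paths below are placeholders; substitute
// whatever your repository actually contains):
//
//     metadata-indexer --base-dir . validate
//     metadata-indexer --base-dir . index --compact
//     metadata-indexer refresh --books lfs,blfs --force
//     metadata-indexer harvest --book lfs --page chapter08/gcc --dry-run
//     metadata-indexer generate --metadata ai/metadata/packages/lfs/gcc.json --overwrite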
fn main() -> Result<()> {
    let cli = Cli::parse();
    let base_dir = cli.base_dir.canonicalize().unwrap_or(cli.base_dir);
    let metadata_dir = base_dir.join("ai").join("metadata");
    let schema_path = metadata_dir.join("schema.json");
    let packages_dir = metadata_dir.join("packages");

    let (_schema_value, schema) = load_schema(&schema_path)?;
    let packages = scan_packages(&packages_dir)?;

    let mut had_errors = false;
    for package in &packages {
        let validation = schema.validate(&package.value);
        if let Err(errors) = validation {
            had_errors = true;
            eprintln!(
                "Schema validation failed for {}:",
                package.relative_path.display()
            );
            for err in errors {
                eprintln!("  - {}", err);
            }
        }
        if let Some(err) = &package.summary_error {
            had_errors = true;
            eprintln!(
                "Summary extraction failed for {}: {}",
                package.relative_path.display(),
                err
            );
        }
    }

    match cli.command {
        Command::Validate => {
            if had_errors {
                anyhow::bail!("metadata validation failed");
            }
        }
        Command::Index { compact } => {
            if had_errors {
                anyhow::bail!("metadata validation failed; index not updated");
            }
            let summaries: Vec<_> = packages
                .iter()
                .filter_map(|pkg| pkg.summary.clone())
                .collect();
            let schema_version = summaries
                .first()
                .map(|s| s.schema_version.as_str())
                .unwrap_or("v0.0.0");
            let generated_at = Utc::now().to_rfc3339();
            let packages_json: Vec<Value> = summaries
                .iter()
                .map(|s| {
                    json!({
                        "id": s.id.clone(),
                        "name": s.name.clone(),
                        "version": s.version.clone(),
                        "stage": s.stage.clone(),
                        "book": s.book.clone(),
                        "variant": s.variant.clone(),
                        "status": s.status.clone(),
                        "path": s.relative_path.clone(),
                    })
                })
                .collect();
            let index = json!({
                "generated_at": generated_at,
                "schema_version": schema_version,
                "packages": packages_json,
            });
            let index_path = metadata_dir.join("index.json");
            let serialized = if compact {
                serde_json::to_string(&index)?
            } else {
                serde_json::to_string_pretty(&index)?
            };
            fs::write(&index_path, serialized)
                .with_context(|| format!("writing {}", index_path.display()))?;
            println!("Updated {}", index_path.display());
        }
        Command::Harvest {
            book,
            page,
            base_url,
            output,
            dry_run,
        } => {
            let book_lower = book.to_lowercase();
            let harvest =
                harvest_metadata(&metadata_dir, &book_lower, &page, base_url.as_deref())?;
            if dry_run {
                println!("{}", serde_json::to_string_pretty(&harvest.value)?);
            } else {
                let output_path = output.unwrap_or_else(|| {
                    metadata_dir
                        .join("packages")
                        .join(&book_lower)
                        .join(format!("{}.json", harvest.slug))
                });
                if let Some(parent) = output_path.parent() {
                    fs::create_dir_all(parent)
                        .with_context(|| format!("creating directory {}", parent.display()))?;
                }
                fs::write(&output_path, serde_json::to_string_pretty(&harvest.value)?)
                    .with_context(|| format!("writing {}", output_path.display()))?;
                println!(
                    "Harvested metadata for {} -> {}",
                    harvest.package_id,
                    output_path.display()
                );
                println!(
                    "Run `metadata_indexer --base-dir {} index` to refresh the index.",
                    base_dir.display()
                );
            }
        }
        Command::Refresh { books, force } => {
            let unique: HashSet<_> = books.into_iter().map(|b| b.to_lowercase()).collect();
            let mut refreshed = 0usize;
            for book in unique {
                for kind in [ManifestKind::WgetList, ManifestKind::Md5Sums] {
                    match refresh_manifest(&metadata_dir, &book, kind, force) {
                        Ok(path) => {
                            refreshed += 1;
                            println!(
                                "Refreshed {} manifest for {} -> {}",
                                kind.description(),
                                book,
                                path.display()
                            );
                        }
                        Err(err) => {
                            eprintln!(
                                "warning: failed to refresh {} manifest for {}: {}",
                                kind.description(),
                                book,
                                err
                            );
                        }
                    }
                }
            }
            if refreshed == 0 {
                println!("No manifests refreshed (check warnings above).");
            }
        }
        Command::Generate {
            metadata,
            output,
            overwrite,
        } => {
            if overwrite {
                match generator::module_directory(&metadata, &output) {
                    Ok(dir) if dir.exists() => {
                        fs::remove_dir_all(&dir).with_context(|| {
                            format!("removing existing module {}", dir.display())
                        })?;
                    }
                    Ok(_) => {}
                    Err(err) => {
                        eprintln!(
                            "warning: could not determine existing module directory: {}",
                            err
                        );
                    }
                }
            }
            let module_path = generator::generate_module(&metadata, &output)?;
            println!("Generated module at {}", module_path.display());
        }
    }

    Ok(())
}
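// The `index` subcommand rewrites ai/metadata/index.json; the result has roughly this
// shape (values below are illustrative, not output from a real run):
//
//     {
//       "generated_at": "2024-01-01T00:00:00+00:00",
//       "schema_version": "v0.1.0",
//       "packages": [
//         {
//           "id": "lfs/gcc", "name": "GCC", "version": "13.2.0", "stage": "system",
//           "book": "lfs", "variant": null, "status": "draft",
//           "path": "packages/lfs/gcc.json"
//         }
//       ]
//     }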
fn load_schema(path: &Path) -> Result<(&'static Value, JSONSchema)> {
    let data = fs::read_to_string(path)
        .with_context(|| format!("reading schema file {}", path.display()))?;
    let value: Value = serde_json::from_str(&data)
        .with_context(|| format!("parsing JSON schema {}", path.display()))?;
    let leaked = Box::leak(Box::new(value));
    let schema = JSONSchema::compile(leaked).context("compiling JSON schema")?;
    Ok((leaked, schema))
}

fn scan_packages(dir: &Path) -> Result<Vec<PackageRecord>> {
    if !dir.exists() {
        return Ok(Vec::new());
    }
    let mut records = Vec::new();
    for entry in WalkDir::new(dir)
        .follow_links(false)
        .into_iter()
        .filter_map(|e| e.ok())
        .filter(|e| e.file_type().is_file())
        .filter(|e| e.path().extension().and_then(|s| s.to_str()) == Some("json"))
    {
        let path = entry.into_path();
        let data = fs::read_to_string(&path)
            .with_context(|| format!("reading package metadata {}", path.display()))?;
        let value: Value = serde_json::from_str(&data)
            .with_context(|| format!("parsing package JSON {}", path.display()))?;
        let relative_path = path
            .strip_prefix(dir.parent().unwrap_or(Path::new("")))
            .unwrap_or(&path)
            .to_path_buf();
        let (summary, summary_error) = match extract_summary(&value, &relative_path) {
            Ok(summary) => (Some(summary), None),
            Err(err) => (None, Some(err)),
        };
        records.push(PackageRecord {
            value,
            relative_path,
            summary,
            summary_error,
        });
    }
    Ok(records)
}

#[derive(Clone)]
struct PackageSummary {
    schema_version: String,
    id: String,
    name: String,
    version: String,
    stage: Option<String>,
    book: String,
    variant: Option<String>,
    status: String,
    relative_path: String,
}

struct PackageRecord {
    value: Value,
    relative_path: PathBuf,
    summary: Option<PackageSummary>,
    summary_error: Option<anyhow::Error>,
}

fn extract_summary(value: &Value, relative_path: &Path) -> Result<PackageSummary> {
    let schema_version = value
        .get("schema_version")
        .and_then(Value::as_str)
        .context("missing schema_version")?
        .to_string();
    let package = value.get("package").context("missing package block")?;
    let status = value.get("status").context("missing status block")?;

    let id = package
        .get("id")
        .and_then(Value::as_str)
        .context("missing package.id")?
        .to_string();
    let name = package
        .get("name")
        .and_then(Value::as_str)
        .context("missing package.name")?
        .to_string();
    let version = package
        .get("version")
        .and_then(Value::as_str)
        .context("missing package.version")?
        .to_string();
    let book = package
        .get("book")
        .and_then(Value::as_str)
        .context("missing package.book")?
        .to_string();
    let stage = package
        .get("stage")
        .and_then(Value::as_str)
        .map(|s| s.to_string());
    let variant = package
        .get("variant")
        .and_then(Value::as_str)
        .map(|s| s.to_string());
    let status_state = status
        .get("state")
        .and_then(Value::as_str)
        .context("missing status.state")?
        .to_string();

    Ok(PackageSummary {
        schema_version,
        id,
        name,
        version,
        stage,
        book,
        variant,
        status: status_state,
        relative_path: relative_path
            .to_str()
            .unwrap_or_default()
            .replace('\\', "/"),
    })
}
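// extract_summary only requires a small subset of the metadata document. A minimal value
// that satisfies it looks roughly like this (placeholder values; schema.json itself may
// demand considerably more):
//
//     {
//       "schema_version": "v0.1.0",
//       "package": { "id": "lfs/gcc", "name": "GCC", "version": "13.2.0", "book": "lfs" },
//       "status": { "state": "draft" }
//     }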
struct HarvestResult {
    value: Value,
    slug: String,
    package_id: String,
}

fn harvest_metadata(
    metadata_dir: &Path,
    book: &str,
    page: &str,
    override_base: Option<&str>,
) -> Result<HarvestResult> {
    let page_url = resolve_page_url(book, page, override_base)?;
    let client = Client::builder()
        .user_agent("lpkg-metadata-indexer/0.1")
        .build()?;
    let response = client
        .get(&page_url)
        .send()
        .with_context(|| format!("fetching {}", page_url))?
        .error_for_status()
        .with_context(|| format!("non-success status for {}", page_url))?;
    let html = response
        .text()
        .with_context(|| format!("reading response body from {}", page_url))?;
    let document = Html::parse_document(&html);
    let harvest = build_metadata_value(metadata_dir, book, &page_url, &document, &html)?;
    Ok(harvest)
}

fn resolve_page_url(book: &str, page: &str, override_base: Option<&str>) -> Result<String> {
    if page.starts_with("http://") || page.starts_with("https://") {
        return Ok(page.to_string());
    }
    let base = override_base
        .map(|s| s.to_string())
        .or_else(|| default_base_url(book).map(|s| s.to_string()))
        .context("no base URL available for book")?;
    let base = base.trim_end_matches('/');
    let mut page_path = page.trim_start_matches('/').to_string();
    if page_path.is_empty() {
        page_path = "index.html".to_string();
    }
    if !page_path.ends_with(".html") {
        page_path.push_str(".html");
    }
    Ok(format!("{}/{}", base, page_path))
}

fn default_base_url(book: &str) -> Option<&'static str> {
    match book {
        "lfs" => Some("https://www.linuxfromscratch.org/lfs/view/12.1"),
        "mlfs" => Some("https://linuxfromscratch.org/~thomas/multilib-m32"),
        "blfs" => Some("https://www.linuxfromscratch.org/blfs/view/systemd"),
        "glfs" => Some("https://www.linuxfromscratch.org/glfs/view/glfs"),
        _ => None,
    }
}
fn build_metadata_value(
    metadata_dir: &Path,
    book: &str,
    page_url: &str,
    document: &Html,
    html: &str,
) -> Result<HarvestResult> {
    let heading_selector = Selector::parse("h1.sect1").unwrap();
    let heading = document
        .select(&heading_selector)
        .next()
        .context("no h1.sect1 heading found")?;
    let heading_text = heading
        .text()
        .map(|t| t.replace('\u{00a0}', " "))
        .collect::<Vec<_>>()
        .join(" ");
    let heading_clean = normalize_whitespace(&heading_text);
    let heading_re = Regex::new(r"^(?P<section>\d+\.\d+)\.\s+(?P<title>.+)$")?;
    let caps = heading_re
        .captures(&heading_clean)
        .with_context(|| format!("unable to parse heading '{}'", heading_clean))?;
    let section = caps["section"].to_string();
    let title = caps["title"].trim().to_string();
    let (name, version, variant) = split_name_variant(&title);
    let chapter_num: u32 = section
        .split('.')
        .next()
        .and_then(|s| s.parse().ok())
        .unwrap_or(0);
    let stage = stage_for_chapter(chapter_num).map(|s| s.to_string());
    let slug_base = slugify(&name);
    let slug = variant
        .as_ref()
        .map(|v| format!("{}-{}", slug_base, slugify(v)))
        .unwrap_or_else(|| slug_base.clone());
    let package_id = format!("{}/{}", book, slug);
    let package_id_for_json = package_id.clone();

    let anchor_url = heading
        .value()
        .id()
        .map(|id| format!("{}#{}", page_url, id))
        .or_else(|| locate_child_anchor(&heading).map(|id| format!("{}#{}", page_url, id)))
        .or_else(|| {
            let anchor_selector = Selector::parse("a[id]").unwrap();
            document
                .select(&anchor_selector)
                .filter_map(|a| a.value().attr("id"))
                .find(|id| id.contains(&slug_base))
                .map(|id| format!("{}#{}", page_url, id))
        })
        .or_else(|| {
            let escaped = regex::escape(&slug_base);
            let pattern = format!(r#"id=\"([^\"]*{}[^\"]*)\""#, escaped);
            Regex::new(&pattern)
                .ok()
                .and_then(|re| re.captures(html))
                .and_then(|caps| caps.get(1))
                .map(|m| format!("{}#{}", page_url, m.as_str()))
        });

    let mut source_urls = collect_tarball_urls(page_url, document);
    let mut archive_name = infer_archive_from_commands(document).or_else(|| {
        source_urls.iter().find_map(|entry| {
            entry
                .url
                .path_segments()
                .and_then(|mut iter| iter.next_back())
                .map(|s| s.to_string())
        })
    });

    if source_urls.is_empty() {
        match fallback_urls_from_wget(metadata_dir, book, &slug_base, &version) {
            Ok(fallback) => {
                if !fallback.is_empty() {
                    eprintln!(
                        "info: using {} URL(s) from wget-list for {} {}",
                        fallback.len(),
                        slug_base,
                        version
                    );
                    source_urls = fallback;
                }
            }
            Err(err) => {
                eprintln!(
                    "warning: failed to consult wget-list for {} {}: {}",
                    slug_base, version, err
                );
            }
        }
    }

    if archive_name.is_none() {
        archive_name = source_urls.iter().find_map(|entry| {
            entry
                .url
                .path_segments()
                .and_then(|mut iter| iter.next_back())
                .map(|s| s.to_string())
        });
        if archive_name.is_none() {
            eprintln!(
                "warning: unable to infer archive name from source URLs for {} {}",
                slug_base, version
            );
        }
    }

    let (sbu, disk) = extract_artifacts(document);
    let build_steps = extract_build_steps(document);

    let mut issues = Vec::new();
    if anchor_url.is_none() {
        issues.push("Could not locate anchor id for primary heading".to_string());
    }
    if source_urls.is_empty() {
        issues.push("No source URLs with archive extensions detected".to_string());
    }
    if build_steps.is_empty() {
        issues.push("No <pre class=\"userinput\"> blocks found for build commands".to_string());
    }

    let source_urls_json: Vec<Value> = source_urls
        .iter()
        .map(|entry| {
            json!({
                "url": entry.url.as_str(),
                "kind": entry.kind,
            })
        })
        .collect();

    let checksum_entries = match resolve_checksums(metadata_dir, book, archive_name.as_deref()) {
        Ok(values) => values,
        Err(err) => {
            eprintln!(
                "warning: failed to resolve checksums for {} {}: {}",
                slug_base, version, err
            );
            Vec::new()
        }
    };

    let build_json: Vec<Value> = build_steps
        .iter()
        .map(|step| {
            json!({
                "phase": step.phase,
                "commands": step.commands,
                "cwd": step.cwd,
                "requires_root": step.requires_root,
                "notes": step.notes,
            })
        })
        .collect();

    let body_selector = Selector::parse("body").unwrap();
    let book_release = document
        .select(&body_selector)
        .next()
        .and_then(|body| body.value().id())
        .map(|id| id.to_string())
        .unwrap_or_default();
    let retrieved_at = Utc::now().to_rfc3339();
    let content_hash = hex::encode(Sha256::digest(html.as_bytes()));
    let anchors_value = match anchor_url {
        Some(ref href) => json!({ "section": href }),
        None => json!({}),
    };
    let status_state = "draft";

    let package_json = json!({
        "schema_version": "v0.1.0",
        "package": {
            "id": package_id_for_json,
            "name": name,
            "upstream": Option::<String>::None,
            "version": version,
            "book": book,
            "chapter": chapter_num,
            "section": section,
            "stage": stage,
            "variant": variant,
            "anchors": anchors_value,
        },
        "source": {
            "urls": source_urls_json,
            "archive": archive_name,
            "checksums": checksum_entries,
        },
        "artifacts": {
            "sbu": sbu,
            "disk": disk,
            "install_prefix": Option::<String>::None,
        },
        "dependencies": {
            "build": Vec::<Value>::new(),
            "runtime": Vec::<Value>::new(),
        },
        "environment": {
            "variables": Vec::<Value>::new(),
            "users": Vec::<Value>::new(),
        },
        "build": build_json,
        "optimizations": {
            "enable_lto": true,
            "enable_pgo": true,
            "cflags": ["-O3", "-flto"],
            "ldflags": ["-flto"],
            "profdata": Option::<String>::None,
        },
        "provenance": {
            "book_release": book_release,
            "page_url": page_url,
            "retrieved_at": retrieved_at,
            "content_hash": content_hash,
        },
        "status": {
            "state": status_state,
            "issues": issues,
        }
    });

    Ok(HarvestResult {
        value: package_json,
        slug,
        package_id,
    })
}
fn normalize_whitespace(input: &str) -> String {
    let mut result = String::with_capacity(input.len());
    let mut prev_space = false;
    for ch in input.chars() {
        if ch.is_whitespace() {
            if !prev_space {
                result.push(' ');
                prev_space = true;
            }
        } else {
            prev_space = false;
            result.push(ch);
        }
    }
    result.trim().to_string()
}

fn slugify(input: &str) -> String {
    let mut result = String::new();
    let mut prev_dash = false;
    for ch in input.chars() {
        let normalized = match ch {
            'A'..='Z' => ch.to_ascii_lowercase(),
            'a'..='z' | '0'..='9' => ch,
            _ => '-',
        };
        if normalized == '-' {
            if !prev_dash {
                result.push('-');
                prev_dash = true;
            }
        } else {
            prev_dash = false;
            result.push(normalized);
        }
    }
    result.trim_matches('-').to_string()
}

fn split_name_variant(title: &str) -> (String, String, Option<String>) {
    let mut base = title.trim().to_string();
    let mut variant = None;
    if let Some(idx) = base.rfind(" - ") {
        variant = Some(base[idx + 3..].trim().to_string());
        base = base[..idx].trim().to_string();
    }
    let bytes = base.as_bytes();
    for idx in (0..bytes.len()).rev() {
        if bytes[idx] == b'-' {
            if let Some(next) = bytes.get(idx + 1) {
                if next.is_ascii_digit() {
                    let name = base[..idx].trim();
                    let version = base[idx + 1..].trim();
                    if !name.is_empty() && !version.is_empty() {
                        return (name.to_string(), version.to_string(), variant);
                    }
                }
            }
        }
    }
    (base, String::from("unknown"), variant)
}

fn stage_for_chapter(chapter: u32) -> Option<&'static str> {
    match chapter {
        5 => Some("cross-toolchain"),
        6 | 7 => Some("temporary-tools"),
        8 => Some("system"),
        9 => Some("system-configuration"),
        10 => Some("system-finalization"),
        _ => None,
    }
}

struct SourceUrlEntry {
    url: url::Url,
    kind: &'static str,
}

#[derive(Clone, Copy)]
enum ManifestKind {
    WgetList,
    Md5Sums,
}

impl ManifestKind {
    fn filename(&self) -> &'static str {
        match self {
            ManifestKind::WgetList => "wget-list.txt",
            ManifestKind::Md5Sums => "md5sums.txt",
        }
    }

    fn description(&self) -> &'static str {
        match self {
            ManifestKind::WgetList => "wget-list",
            ManifestKind::Md5Sums => "md5sums",
        }
    }
}
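/// Walk every `<a href>` on the page and keep links that look like source artifacts
/// (tarballs, patches, detached signatures), resolving relative hrefs against the page
/// URL and de-duplicating the results.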
fn collect_tarball_urls(page_url: &str, document: &Html) -> Vec<SourceUrlEntry> {
    let base = url::Url::parse(page_url).ok();
    let link_selector = Selector::parse("a").unwrap();
    let mut seen = HashSet::new();
    let mut results = Vec::new();
    for link in document.select(&link_selector) {
        if let Some(href) = link.value().attr("href") {
            if let Some(kind) = classify_artifact_url(href) {
                let resolved = match (&base, url::Url::parse(href)) {
                    (_, Ok(url)) => url,
                    (Some(base_url), Err(_)) => match base_url.join(href) {
                        Ok(url) => url,
                        Err(_) => continue,
                    },
                    _ => continue,
                };
                if seen.insert(resolved.clone()) {
                    results.push(SourceUrlEntry {
                        url: resolved,
                        kind,
                    });
                }
            }
        }
    }
    results
}

fn classify_artifact_url(href: &str) -> Option<&'static str> {
    let lower = href.to_lowercase();
    if lower.ends_with(".tar")
        || lower.ends_with(".tar.gz")
        || lower.ends_with(".tar.bz2")
        || lower.ends_with(".tar.xz")
        || lower.ends_with(".tgz")
        || lower.ends_with(".zip")
    {
        Some("primary")
    } else if lower.ends_with(".patch") {
        Some("patch")
    } else if lower.ends_with(".sig") || lower.ends_with(".asc") {
        Some("signature")
    } else {
        None
    }
}

fn fallback_urls_from_wget(
    metadata_dir: &Path,
    book: &str,
    slug: &str,
    version: &str,
) -> Result<Vec<SourceUrlEntry>> {
    let manifest = load_jhalfs_manifest(metadata_dir, book, ManifestKind::WgetList)?;
    let needle = format!("{}-{}", slug.replace('_', "-"), version);
    eprintln!("debug: searching wget-list for '{}'", needle);
    let mut entries = Vec::new();
    for line in manifest.lines() {
        if line.contains(&needle) {
            if let Ok(url) = url::Url::parse(line.trim()) {
                eprintln!("info: matched wget URL {}", url);
                entries.push(SourceUrlEntry {
                    url,
                    kind: "primary",
                });
            } else {
                eprintln!(
                    "warning: unable to parse URL from wget-list line: {}",
                    line.trim()
                );
            }
        }
    }
    if entries.is_empty() {
        eprintln!("warning: no wget-list entries matched '{}'", needle);
    }
    Ok(entries)
}

fn resolve_checksums(
    metadata_dir: &Path,
    book: &str,
    archive_name: Option<&str>,
) -> Result<Vec<Value>> {
    let mut checksums = Vec::new();
    let Some(archive) = archive_name else {
        return Ok(checksums);
    };
    let manifest = load_jhalfs_manifest(metadata_dir, book, ManifestKind::Md5Sums)?;
    for line in manifest.lines() {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            continue;
        }
        let mut parts = trimmed.split_whitespace();
        let Some(hash) = parts.next() else { continue };
        let Some(file) = parts.next() else { continue };
        if file == archive {
            checksums.push(json!({
                "alg": "md5",
                "value": hash.to_lowercase(),
            }));
            break;
        }
    }
    Ok(checksums)
}

fn load_jhalfs_manifest(metadata_dir: &Path, book: &str, kind: ManifestKind) -> Result<String> {
    let cache_path = refresh_manifest(metadata_dir, book, kind, false)?;
    fs::read_to_string(&cache_path)
        .with_context(|| format!("reading cached manifest {}", cache_path.display()))
}
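// Both jhalfs manifests are plain text: wget-list carries one download URL per line,
// and md5sums pairs a hex digest with an archive file name, roughly:
//
//     <32-hex-digit md5>  gcc-13.2.0.tar.xz
//
// fallback_urls_from_wget and resolve_checksums above depend on exactly that layout.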
fn refresh_manifest(
    metadata_dir: &Path,
    book: &str,
    kind: ManifestKind,
    force: bool,
) -> Result<PathBuf> {
    let cache_dir = metadata_dir.join("cache");
    fs::create_dir_all(&cache_dir)
        .with_context(|| format!("creating cache directory {}", cache_dir.display()))?;
    let cache_path = cache_dir.join(format!("{}-{}", book, kind.filename()));
    if cache_path.exists() && !force {
        return Ok(cache_path);
    }
    let url = manifest_url(book, &kind)
        .with_context(|| format!("no manifest URL configured for book '{}'", book))?;
    let client = Client::builder().redirect(Policy::limited(5)).build()?;
    let body = client
        .get(url)
        .send()
        .with_context(|| format!("fetching {}", url))?
        .error_for_status()
        .with_context(|| format!("request failed for {}", url))?
        .text()
        .with_context(|| format!("reading response body from {}", url))?;
    fs::write(&cache_path, &body)
        .with_context(|| format!("caching manifest {}", cache_path.display()))?;
    Ok(cache_path)
}

fn manifest_url(book: &str, kind: &ManifestKind) -> Option<&'static str> {
    match (book, kind) {
        ("mlfs", ManifestKind::WgetList) => {
            Some("https://www.linuxfromscratch.org/~thomas/multilib-m32/wget-list-sysv")
        }
        ("mlfs", ManifestKind::Md5Sums) => {
            Some("https://www.linuxfromscratch.org/~thomas/multilib-m32/md5sums")
        }
        ("lfs", ManifestKind::WgetList) => {
            Some("https://www.linuxfromscratch.org/lfs/view/12.1/wget-list")
        }
        ("lfs", ManifestKind::Md5Sums) => {
            Some("https://www.linuxfromscratch.org/lfs/view/12.1/md5sums")
        }
        ("blfs", ManifestKind::WgetList) => {
            Some("https://anduin.linuxfromscratch.org/BLFS/view/systemd/wget-list")
        }
        ("blfs", ManifestKind::Md5Sums) => {
            Some("https://anduin.linuxfromscratch.org/BLFS/view/systemd/md5sums")
        }
        ("glfs", ManifestKind::WgetList) => {
            Some("https://www.linuxfromscratch.org/glfs/view/glfs/wget-list")
        }
        ("glfs", ManifestKind::Md5Sums) => {
            Some("https://www.linuxfromscratch.org/glfs/view/glfs/md5sums")
        }
        _ => None,
    }
}

fn locate_child_anchor(heading: &ElementRef) -> Option<String> {
    let mut current = heading.first_child();
    while let Some(node) = current {
        if let Some(element) = ElementRef::wrap(node) {
            if let Some(id) = element
                .value()
                .attr("id")
                .or_else(|| element.value().attr("name"))
            {
                return Some(id.to_string());
            }
        }
        current = node.next_sibling();
    }
    None
}

fn infer_archive_from_commands(document: &Html) -> Option<String> {
    let pre_selector = Selector::parse("pre.userinput").unwrap();
    for pre in document.select(&pre_selector) {
        let text = pre.text().collect::<Vec<_>>().join("\n");
        for line in text.lines() {
            if let Some(start) = line.find("tar -xf") {
                let args = line[start + 7..].trim();
                let parts: Vec<&str> = args.split_whitespace().collect();
                if let Some(archive) = parts.get(0) {
                    let cleaned = archive.trim_matches(['"', '\'', ','].as_ref());
                    if cleaned.ends_with(".tar")
                        || cleaned.contains(".tar.")
                        || cleaned.ends_with(".tgz")
                        || cleaned.ends_with(".zip")
                    {
                        return Some(cleaned.trim_start_matches("../").to_string());
                    }
                }
            }
        }
    }
    None
}

fn extract_artifacts(document: &Html) -> (Option<f64>, Option<i64>) {
    let seg_selector = Selector::parse("div.segmentedlist div.seg").unwrap();
    let title_selector = Selector::parse("strong.segtitle").unwrap();
    let body_selector = Selector::parse("span.segbody").unwrap();
    let mut sbu = None;
    let mut disk = None;
    for seg in document.select(&seg_selector) {
        let title = seg
            .select(&title_selector)
            .next()
            .map(|n| normalize_whitespace(&n.text().collect::<Vec<_>>().join("")));
        let body = seg
            .select(&body_selector)
            .next()
            .map(|n| normalize_whitespace(&n.text().collect::<Vec<_>>().join("")));
        if let (Some(title), Some(body)) = (title, body) {
            if title.contains("Approximate build time") {
                if let Some(value) = parse_numeric(&body) {
                    sbu = Some(value);
                }
            } else if title.contains("Required disk space") {
                if let Some(value) = parse_numeric(&body) {
                    disk = Some(value as i64);
                }
            }
        }
    }
    (sbu, disk)
}

fn parse_numeric(input: &str) -> Option<f64> {
    let re = Regex::new(r"([0-9]+(?:\.[0-9]+)?)").ok()?;
    re.captures(input)
        .and_then(|caps| caps.get(1))
        .and_then(|m| m.as_str().parse().ok())
}

struct BuildStep {
    phase: &'static str,
    commands: Vec<String>,
    cwd: Option<String>,
    requires_root: bool,
    notes: Option<String>,
}
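/// Turn every `<pre class="userinput">` block on the page into a BuildStep, using
/// classify_phase below to guess whether it represents setup, configure, build, test,
/// or install commands.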
fn extract_build_steps(document: &Html) -> Vec<BuildStep> {
    let pre_selector = Selector::parse("pre.userinput").unwrap();
    let mut steps = Vec::new();
    for pre in document.select(&pre_selector) {
        let code = pre.text().collect::<Vec<_>>().join("\n");
        let commands: Vec<String> = code
            .lines()
            .map(|line| line.trim().to_string())
            .filter(|line| !line.is_empty())
            .collect();
        if commands.is_empty() {
            continue;
        }
        let phase = classify_phase(&commands);
        steps.push(BuildStep {
            phase,
            commands,
            cwd: None,
            requires_root: false,
            notes: None,
        });
    }
    steps
}

fn classify_phase(commands: &[String]) -> &'static str {
    let joined = commands.join("\n").to_lowercase();
    if joined.contains("make install") {
        "install"
    } else if joined.contains("make -k check") || joined.contains("make check") {
        "test"
    } else if joined.contains("configure") {
        "configure"
    } else if joined.contains("tar -xf") || joined.contains("mkdir ") {
        "setup"
    } else {
        "build"
    }
}
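// A minimal test sketch for the pure helpers above. These assertions are derived by
// reading the functions in this file rather than from any external specification, so
// treat them as illustrative coverage, not a contract.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn normalizes_whitespace_and_slugs() {
        assert_eq!(
            normalize_whitespace("  GCC   13.2.0 \n Pass 1 "),
            "GCC 13.2.0 Pass 1"
        );
        assert_eq!(slugify("GCC-13.2.0"), "gcc-13-2-0");
        assert_eq!(slugify("Xorg Libraries"), "xorg-libraries");
    }

    #[test]
    fn splits_name_version_and_variant() {
        assert_eq!(
            split_name_variant("GCC-13.2.0 - Pass 1"),
            (
                "GCC".to_string(),
                "13.2.0".to_string(),
                Some("Pass 1".to_string())
            )
        );
        assert_eq!(
            split_name_variant("Binutils-2.42"),
            ("Binutils".to_string(), "2.42".to_string(), None)
        );
        assert_eq!(
            split_name_variant("Introduction"),
            ("Introduction".to_string(), "unknown".to_string(), None)
        );
    }

    #[test]
    fn maps_chapters_and_artifact_urls() {
        assert_eq!(stage_for_chapter(5), Some("cross-toolchain"));
        assert_eq!(stage_for_chapter(8), Some("system"));
        assert_eq!(stage_for_chapter(3), None);
        assert_eq!(classify_artifact_url("gcc-13.2.0.tar.xz"), Some("primary"));
        assert_eq!(classify_artifact_url("glibc-2.39-fhs-1.patch"), Some("patch"));
        assert_eq!(classify_artifact_url("gcc-13.2.0.tar.xz.sig"), Some("signature"));
        assert_eq!(classify_artifact_url("index.html"), None);
    }

    #[test]
    fn classifies_build_phases() {
        let cmds = |v: &[&str]| v.iter().map(|s| s.to_string()).collect::<Vec<_>>();
        assert_eq!(
            classify_phase(&cmds(&["./configure --prefix=/usr", "make"])),
            "configure"
        );
        assert_eq!(classify_phase(&cmds(&["make -k check"])), "test");
        assert_eq!(classify_phase(&cmds(&["make install"])), "install");
        assert_eq!(classify_phase(&cmds(&["make"])), "build");
    }

    #[test]
    fn parses_numbers_and_page_urls() {
        assert_eq!(parse_numeric("4.2 SBU"), Some(4.2));
        assert_eq!(parse_numeric("no digits here"), None);
        assert_eq!(
            resolve_page_url("lfs", "chapter08/binutils", None).unwrap(),
            "https://www.linuxfromscratch.org/lfs/view/12.1/chapter08/binutils.html"
        );
        assert_eq!(
            resolve_page_url("lfs", "https://example.org/page.html", None).unwrap(),
            "https://example.org/page.html"
        );
    }
}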