Integrate metadata documentation and jhalfs manifests

This commit is contained in:
m00d 2025-10-01 06:58:04 +02:00
parent 74bf8a32d6
commit 3ce470e019
34 changed files with 5544 additions and 240 deletions

113
src/ingest/blfs.rs Normal file
View file

@ -0,0 +1,113 @@
use anyhow::{Context, Result};
use regex::Regex;
use reqwest::blocking::Client;
use scraper::{Html, Selector};
use super::{BookPackage, FetchOptions};
use crate::ingest::lfs::split_name_version;
pub fn fetch_book(options: &FetchOptions) -> Result<Vec<BookPackage>> {
let base = options.base_url.trim_end_matches('/');
let url = format!("{base}/book.html");
let client = Client::builder().build().context("building HTTP client")?;
let body = client
.get(&url)
.send()
.with_context(|| format!("fetching {}", url))?
.error_for_status()
.with_context(|| format!("request failed for {}", url))?
.text()
.context("reading response body")?;
parse_book_html(options, &url, &body)
}
pub fn parse_book_html(
options: &FetchOptions,
book_url: &str,
body: &str,
) -> Result<Vec<BookPackage>> {
let document = Html::parse_document(body);
let selector = Selector::parse("h1.sect1").unwrap();
let numbering_re =
Regex::new(r"^(?P<chapter>\d+)\.(?P<section>\d+)\.\s+(?P<title>.+)$").unwrap();
let mut results = Vec::new();
for heading in document.select(&selector) {
let text = heading
.text()
.map(str::trim)
.filter(|s| !s.is_empty())
.collect::<Vec<_>>()
.join(" ")
.replace('\n', " ")
.trim()
.to_string();
if text.is_empty() {
continue;
}
// BLFS headings often look like "33.2. Bzip2" or "33.2. Bzip2-1.0.8"
let caps = match numbering_re.captures(&text) {
Some(caps) => caps,
None => continue,
};
let chapter_num: u32 = caps["chapter"].parse().unwrap_or(0);
let section_num: u32 = caps["section"].parse().unwrap_or(0);
let title = caps["title"].trim();
let (name, version, variant) = match split_name_version(title) {
Some(parts) => parts,
None => continue,
};
let href = heading.value().id().map(|id| {
let mut base = book_url.to_string();
if !base.contains('#') {
base.push('#');
}
format!("{}{}", base, id)
});
let section_label = Some(format!("{}.{}", chapter_num, section_num));
results.push(BookPackage {
book: options.book,
chapter: Some(chapter_num),
section: section_label,
name,
version: Some(version),
href,
md5: None,
stage: None,
variant,
notes: None,
});
}
Ok(results)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::ingest::BookKind;
#[test]
fn parse_blfs_sample() {
let html = r#"
<html><body>
<h1 class=\"sect1\" id=\"ch33-bzip2\">33.2. Bzip2-1.0.8</h1>
<h1 class=\"sect1\" id=\"ch33-about\">33.1. Introduction</h1>
</body></html>
"#;
let opts = FetchOptions::new("https://example.invalid/blfs", BookKind::Blfs);
let items = parse_book_html(&opts, "https://example.invalid/blfs/book.html", html).unwrap();
assert_eq!(items.len(), 1);
assert_eq!(items[0].name, "Bzip2");
assert_eq!(items[0].version.as_deref(), Some("1.0.8"));
}
}