This commit is contained in:
Lucy 2025-09-30 21:38:22 +02:00
parent 1eb7d4c1eb
commit 35c3f4c423
10 changed files with 353 additions and 738 deletions

View file

@@ -1,33 +1,14 @@
use html_parser::Dom;
use reqwest::blocking::get;
use std::error::Error;
use scraper::{Html, Selector};
// NOTE(review): this span looks like a rendered diff hunk in which the removed
// and the added lines are interleaved and the +/- markers were stripped, so two
// function definitions overlap below. Confirm against the actual commit before
// treating it as a single compilable file.
/// Downloads the HTML page at the given URL and converts it to JSON.
///
/// # Errors
/// Returns an error if the request fails, if the response status is not a
/// success status, if the body cannot be read as text, if the HTML cannot be
/// parsed, or if the DOM cannot be serialized to JSON.
pub fn fetch_and_parse_html_to_json(url: &str) -> Result<String, Box<dyn Error>> {
// Download the HTML
let response = get(url)?;
// Reject non-success responses with a message naming the URL and the status.
if !response.status().is_success() {
return Err(format!("Fehler beim Abrufen der URL {}: {}", url, response.status()).into());
/// Fetches `url` and returns the inner HTML of every `<pre>` element found in
/// the response body, one `String` per element.
///
/// # Errors
/// Propagates any error from sending the request or reading the response text.
///
/// # Panics
/// Panics if `Selector::parse("pre")` returns an error, because the result is
/// unwrapped.
pub fn fetch_pre_blocks(url: &str) -> anyhow::Result<Vec<String>> {
// NOTE(review): no status check is made here, unlike the `is_success()` guard
// in `fetch_and_parse_html_to_json` — confirm that scraping the body of a
// non-success response is intended.
let body = reqwest::blocking::get(url)?.text()?;
let document = Html::parse_document(&body);
// The selector is a fixed literal that matches `<pre>` elements.
let selector = Selector::parse("pre").unwrap();
let mut results = Vec::new();
// Collect the inner HTML of each matched element.
for element in document.select(&selector) {
results.push(element.inner_html());
}
let body = response.text()?;
// Parse the HTML
let dom = Dom::parse(&body)?;
// Convert to JSON
let json = dom.to_json_pretty()?;
Ok(json)
}
#[cfg(test)]
mod tests {
use super::*;
// Calls `fetch_and_parse_html_to_json` with an https URL, so running this test
// performs a real request and needs network access.
#[test]
fn test_fetch_and_parse() {
let url = "https://www.linuxfromscratch.org/~thomas/multilib-m32/chapter02/hostreqs.html";
let json = fetch_and_parse_html_to_json(url).expect("Fehler beim Parsen");
// The JSON output must contain this string.
assert!(json.contains("Host System Requirements"));
}
// Return value of `fetch_pre_blocks`: the collected `<pre>` contents.
Ok(results)
}