// Cargo.toml
[dependencies]
quick-xml = "0.31.0"
// XML sample (one <Chemical> record). Goal: extract the CAS registry number, substance name, and molecular formula into a pipe-delimited file.
<Chemical id="0000034742" displayFormula="C12-H14-O4" displayName="Monobutyl phthalate">
<NameList><NameOfSubstance>Monobutyl phthalate<SourceList><Source>MeSH</Source></SourceList></NameOfSubstance>
<Synonyms>Mono-n-butyl-phthalate<SourceList><Source>NLM</Source></SourceList></Synonyms></NameList><NumberList>
<CASRegistryNumber>34-74-2<SourceList/></CASRegistryNumber></NumberList><ClassificationList/>
<FormulaList><MolecularFormula>C12-H14-O4<SourceList><Source>NLM</Source></SourceList></MolecularFormula></FormulaList>
<FormulaFragmentList/><NoteList/><LocatorList><FileLocator url="https://meshb.nlm.nih.gov/record/ui?name=monobutyl phthalate">MeSH</FileLocator>
<FileLocator url="https://pubchem.ncbi.nlm.nih.gov/#query=34-74-2">PubChem</FileLocator>
<FileLocator url="https://pubmed.ncbi.nlm.nih.gov/?term=monobutyl+phthalate">PubMed</FileLocator>
<FileLocator url="https://www.ncbi.nlm.nih.gov/pmc/?term=%22monobutyl+phthalate%22">PubMed Central</FileLocator>
<InternetLocator url="https://search.usa.gov/search?utf8=?&amp;m=false&amp;affiliate=usagov&amp;query=&quot;monobutyl+phthalate&quot;">USA.gov</InternetLocator></LocatorList></Chemical>
use quick_xml::Reader;
use quick_xml::events::Event;
use std::fs::File;
use std::io::{BufReader, BufWriter, Write};
/// Streams an XML file of `<Chemical>` records and writes one pipe-delimited
/// line per record: `CAS|name|formula`.
///
/// For each `<Chemical>` element the text found directly inside
/// `<CASRegistryNumber>`, `<NameOfSubstance>` and `<MolecularFormula>` is
/// captured. If one of those elements occurs more than once in a record, the
/// last occurrence wins. A field that never appears is written as an empty
/// column. Values are written verbatim; a `|` inside a value is not escaped.
///
/// # Parameters
/// * `xml_file_path` - path of the XML file to read.
/// * `output_file_path` - path of the text file to create (truncated if it exists).
///
/// # Errors
/// Returns an error if either file cannot be opened/created, if the XML is
/// malformed, if a start-tag name is not valid UTF-8, if the text of a captured
/// field cannot be unescaped, or if writing/flushing the output fails.
fn extract_chemical_data(
    xml_file_path: &str,
    output_file_path: &str,
) -> Result<(), Box<dyn std::error::Error>> {
    /// The output column whose element the parser is currently directly inside.
    #[derive(Clone, Copy)]
    enum Field {
        Cas,
        Name,
        Formula,
    }

    let mut xml_reader = Reader::from_reader(BufReader::new(File::open(xml_file_path)?));
    xml_reader.trim_text(true);

    let mut writer = BufWriter::new(File::create(output_file_path)?);

    let mut buf = Vec::new();
    let mut cas_registry_number = String::new();
    let mut name_of_substance = String::new();
    let mut molecular_formula = String::new();
    // `None` whenever the most recently opened element is not a captured field,
    // or an element has just been closed.
    let mut current_field: Option<Field> = None;

    loop {
        match xml_reader.read_event_into(&mut buf)? {
            Event::Start(start) => {
                let tag = start.name();
                // Validate the tag name as UTF-8 (without allocating) so that a
                // malformed name is still reported as an error.
                current_field = match std::str::from_utf8(tag.as_ref())? {
                    "CASRegistryNumber" => {
                        cas_registry_number.clear();
                        Some(Field::Cas)
                    }
                    "NameOfSubstance" => {
                        name_of_substance.clear();
                        Some(Field::Name)
                    }
                    "MolecularFormula" => {
                        molecular_formula.clear();
                        Some(Field::Formula)
                    }
                    // Any other element (e.g. a nested <SourceList>) shadows the
                    // field, so its text is not mistaken for the field's value.
                    _ => None,
                };
            }
            Event::Text(text) => {
                if let Some(field) = current_field {
                    let value = text.unescape()?;
                    let target = match field {
                        Field::Cas => &mut cas_registry_number,
                        Field::Name => &mut name_of_substance,
                        Field::Formula => &mut molecular_formula,
                    };
                    // Reuse the existing allocation instead of building a new String.
                    target.clear();
                    target.push_str(&value);
                }
            }
            Event::End(end) => {
                // Leaving any element ends the capture; otherwise stray text that
                // follows a closing tag would overwrite the value just read.
                current_field = None;
                if end.name().as_ref() == b"Chemical" {
                    writeln!(
                        writer,
                        "{}|{}|{}",
                        cas_registry_number, name_of_substance, molecular_formula
                    )?;
                    cas_registry_number.clear();
                    name_of_substance.clear();
                    molecular_formula.clear();
                }
            }
            Event::Eof => break,
            _ => (),
        }
        buf.clear();
    }

    // Dropping a BufWriter ignores flush errors; flush explicitly so a failed
    // final write is reported instead of silently truncating the output.
    writer.flush()?;
    Ok(())
}
/// Program entry point: converts the fixed ChemID XML dump into `output.txt`
/// and prints a confirmation once the extraction has succeeded. Any error from
/// the extraction is returned to the runtime unchanged.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    const XML_INPUT: &str = "chemid-2023-02-22.xml";
    const PIPE_OUTPUT: &str = "output.txt";

    extract_chemical_data(XML_INPUT, PIPE_OUTPUT)
        .map(|()| println!("Data extracted successfully!"))
}