use std::collections::BTreeMap as Map; use std::collections::HashSet; use crate::error::Error; use crate::hash::sha256; use crate::rdf::{BlankNodeLabel, DataSet, Predicate, Statement}; /// #[derive(Debug, Clone)] pub struct NormalizationState<'a> { pub blank_node_to_quads: Map<&'a str, Vec<&'a Statement>>, pub hash_to_blank_nodes: Map>, pub canonical_issuer: IdentifierIssuer, } /// /// #[derive(Debug, Clone)] pub struct IdentifierIssuer { pub identifier_prefix: String, pub identifier_counter: u64, pub issued_identifiers_list: Vec<(String, String)>, } impl IdentifierIssuer { pub fn new(prefix: String) -> Self { Self { identifier_prefix: prefix, identifier_counter: 0, issued_identifiers_list: Vec::new(), } } pub fn find_issued_identifier(&self, existing_identifier: &str) -> Option<&str> { // TODO(optimize): index issued_identifiers_list by existing_identifier self.issued_identifiers_list .iter() .find(|(_, existing_id)| existing_id == existing_identifier) .map(|(issued_identifier, _)| issued_identifier.as_ref()) } } #[derive(Debug, Clone)] pub struct HashNDegreeQuadsOutput { pub hash: String, pub issuer: IdentifierIssuer, } fn digest_to_lowerhex(digest: &[u8]) -> String { digest .iter() .map(|byte| format!("{:02x}", byte)) .collect::() } /// pub fn hash_first_degree_quads( normalization_state: &mut NormalizationState, reference_blank_node_identifier: &str, ) -> Result { // https://json-ld.github.io/rdf-dataset-canonicalization/spec/#algorithm-1 // 1 let mut nquads: Vec = Vec::new(); // 2 if let Some(quads) = normalization_state .blank_node_to_quads .get(reference_blank_node_identifier) { // 3 for quad in quads { // 3.1 let mut quad: Statement = (*quad).clone(); // 3.1.1 for label in quad.blank_node_components_mut() { // 3.1.1.1 label.0 = if label.0 == reference_blank_node_identifier { "_:a".to_string() } else { "_:z".to_string() }; } let nquad = String::from(&quad); nquads.push(nquad); } } // 4 nquads.sort(); // 5 let joined_nquads = nquads.join(""); let nquads_digest = sha256(joined_nquads.as_bytes())?; let hash_hex = digest_to_lowerhex(&nquads_digest); Ok(hash_hex) } /// pub fn normalize(input_dataset: &DataSet) -> Result { // https://json-ld.github.io/rdf-dataset-canonicalization/spec/#algorithm // 1 let mut normalization_state = NormalizationState { blank_node_to_quads: Map::new(), hash_to_blank_nodes: Map::new(), canonical_issuer: IdentifierIssuer::new("_:c14n".to_string()), }; // 2 let input_dataset_quads = input_dataset.statements(); for quad in input_dataset_quads.iter() { // 2.1 for blank_node_identifier in quad.blank_node_components() { normalization_state .blank_node_to_quads .entry(&blank_node_identifier.0) .or_insert_with(Vec::new) .push(quad); } } // 3 let mut non_normalized_identifiers: HashSet<&str> = normalization_state .blank_node_to_quads .keys() .cloned() .collect(); // 4 let mut simple = true; // 5 while simple { // 5.1 simple = false; // 5.2 normalization_state.hash_to_blank_nodes.clear(); // 5.3 for identifier in non_normalized_identifiers.iter() { // 5.3.1 let hash = hash_first_degree_quads(&mut normalization_state, identifier)?; // 5.3.2 normalization_state .hash_to_blank_nodes .entry(hash) .or_insert_with(Vec::new) .push(identifier); } // 5.4 let mut hashes_to_remove = Vec::new(); for (hash, identifier_list) in normalization_state.hash_to_blank_nodes.iter() { // 5.4.1 if identifier_list.len() > 1 { continue; } // 5.4.2 let identifier = match identifier_list.iter().next() { Some(id) => id, None => continue, }; // note: canonical issuer is not passed issue_identifier(&mut normalization_state.canonical_issuer, identifier)?; // 5.4.3 non_normalized_identifiers.remove(identifier); // 5.4.4 // Cannot remove while iterating hashes_to_remove.push(hash.clone()); // 5.4.5 simple = true; } for hash in hashes_to_remove { normalization_state.hash_to_blank_nodes.remove(&hash); } // 6 // Clone normalization_state to avoid mutable borrow for (_hash, identifier_list) in normalization_state.hash_to_blank_nodes.clone() { // 6.1 let mut hash_path_list: Vec = Vec::new(); // 6.2 for identifier in identifier_list { // 6.2.1 if normalization_state .canonical_issuer .find_issued_identifier(identifier) .is_some() { continue; } // 6.2.2 let mut temporary_issuer = IdentifierIssuer::new("_:b".to_string()); // 6.2.3 issue_identifier(&mut temporary_issuer, identifier)?; // 6.2.4 hash_path_list.push(hash_n_degree_quads( &mut normalization_state, identifier, &mut temporary_issuer, )?); } // 6.3 hash_path_list.sort_by(|a, b| a.hash.cmp(&b.hash)); for result in hash_path_list { // 6.3.1 let identifier_issuer = result.issuer; for (_, existing_identifier) in identifier_issuer.issued_identifiers_list { issue_identifier( &mut normalization_state.canonical_issuer, &existing_identifier, )?; } } } } // 7 let mut normalized_dataset = DataSet::default(); for quad in input_dataset_quads.iter() { // 7.1 let mut quad_copy = quad.clone(); for label in quad_copy.blank_node_components_mut() { let canonical_identifier = match normalization_state .canonical_issuer .find_issued_identifier(&label.0) { Some(id) => id, None => return Err(Error::MissingIdentifier), }; label.0 = canonical_identifier.to_string(); } // 7.2 normalized_dataset.add_statement(quad_copy); } // 8 Ok(normalized_dataset) } /// pub fn issue_identifier( identifier_issuer: &mut IdentifierIssuer, existing_identifier: &str, ) -> Result { // https://json-ld.github.io/rdf-dataset-canonicalization/spec/#algorithm-0 // 1 if let Some(id) = identifier_issuer.find_issued_identifier(existing_identifier) { return Ok(id.to_string()); } // 2 let issued_identifier = identifier_issuer.identifier_prefix.to_owned() + &identifier_issuer.identifier_counter.to_string(); // 3 identifier_issuer.issued_identifiers_list.push(( issued_identifier.to_string(), existing_identifier.to_string(), )); // 4 identifier_issuer.identifier_counter += 1; // 5 Ok(issued_identifier) } /// pub fn hash_n_degree_quads( normalization_state: &mut NormalizationState, identifier: &str, issuer: &mut IdentifierIssuer, ) -> Result { let mut issuer = issuer; // https://json-ld.github.io/rdf-dataset-canonicalization/spec/#algorithm-3 let mut issuer_tmp: IdentifierIssuer; // 1 let mut hash_to_related_blank_nodes: Map> = Map::new(); // 2 if let Some(quads) = normalization_state .blank_node_to_quads .get(identifier) // Clone to prevent multiple mutable borrows of normalization state .cloned() { // 3 for quad in quads { // 3.1 for (component, position) in quad.blank_node_components_with_position() { // Not checking for predicate since that cannot be a blank node identifier anyway if component.0 != identifier { // 3.1.1 let hash = hash_related_blank_node( normalization_state, &component.0, quad, issuer, position, )?; // 3.1.2 hash_to_related_blank_nodes .entry(hash) .or_insert_with(Vec::new) .push(component); } } } } // 4 let mut data_to_hash = String::new(); // 5 // Using BTreeMap for sort by hash for (related_hash, blank_node_list) in hash_to_related_blank_nodes { // 5.1 data_to_hash.push_str(&related_hash); // 5.2 let mut chosen_path = String::new(); // 5.3 let mut chosen_issuer = None; // 5.4 for permutation in combination::permutate::from_vec(&blank_node_list) { // 5.4.1 let mut issuer_copy = issuer.clone(); // 5.4.2 let mut path = String::new(); // 5.4.3 let mut recursion_list: Vec = Vec::new(); // 5.4.4 for related in permutation { // 5.4.4.1 if let Some(canonical_identifier) = normalization_state .canonical_issuer .find_issued_identifier(&related.0) .as_ref() { recursion_list.push(canonical_identifier.to_string()); // 5.4.4.2 } else { // 5.4.4.2.1 if issuer_copy.find_issued_identifier(&related.0).is_none() { recursion_list.push(related.0.to_string()); } // 5.4.4.2.2 path += &issue_identifier(&mut issuer_copy, &related.0)?; } // 5.4.4.3 if !chosen_path.is_empty() && path.len() >= chosen_path.len() && path > chosen_path { continue; } } // 5.4.5 for related in recursion_list { // 5.4.5.1 let result = hash_n_degree_quads(normalization_state, &related, &mut issuer_copy)?; // 5.4.5.2 path.push_str(&issue_identifier(&mut issuer_copy, &related)?); // 5.4.5.3 path.push('<'); path.push_str(&result.hash); path.push('>'); // 5.4.5.4 issuer_copy = result.issuer; // 5.4.5.5 if !chosen_path.is_empty() && path.len() >= chosen_path.len() && path > chosen_path { continue; } } // 5.4.6 if chosen_path.is_empty() || path < chosen_path { chosen_path = path; chosen_issuer.replace(issuer_copy); } } // 5.5 data_to_hash.push_str(&chosen_path); // 5.6 issuer_tmp = match chosen_issuer { Some(issuer) => issuer, None => return Err(Error::MissingChosenIssuer), }; issuer = &mut issuer_tmp; } // 6 let digest = sha256(data_to_hash.as_bytes())?; let hash = digest_to_lowerhex(&digest); Ok(HashNDegreeQuadsOutput { hash, issuer: issuer.to_owned(), }) } /// pub fn hash_related_blank_node( normalization_state: &mut NormalizationState, related: &str, quad: &Statement, issuer: &mut IdentifierIssuer, position: char, ) -> Result { // https://json-ld.github.io/rdf-dataset-canonicalization/spec/#algorithm-2 // 1 let identifier = match normalization_state .canonical_issuer .find_issued_identifier(related) { Some(id) => id.to_string(), None => match issuer.find_issued_identifier(related) { Some(id) => id.to_string(), None => hash_first_degree_quads(normalization_state, related)?, }, }; // 2 let mut input = position.to_string(); // 3 if position != 'g' { let Predicate::IRIRef(ref predicate) = quad.predicate; input.push('<'); input.push_str(&predicate.0); input.push('>'); } // 4 input += &identifier; // 5 let digest = sha256(input.as_bytes())?; let hash_hex = digest_to_lowerhex(&digest); Ok(hash_hex) } #[cfg(test)] mod tests { use super::*; #[test] /// fn normalization_test_suite() { use std::fs::{self}; use std::path::PathBuf; use std::str::FromStr; let case = std::env::args().skip(2).next(); // Example usage to run a single test case: // cargo test normalization_test_suite -- test022 let mut passed = 0; let mut total = 0; for entry in fs::read_dir("json-ld-normalization/tests").unwrap() { let entry = entry.unwrap(); let filename = entry.file_name().into_string().unwrap(); if !filename.starts_with("test") || !filename.ends_with("-urdna2015.nq") { continue; } let num = &filename[0..7].to_string(); if let Some(ref case) = case { if case != num { continue; } } total += 1; let mut path = entry.path(); let expected_str = fs::read_to_string(&path).unwrap(); let in_file_name = num.to_string() + "-in.nq"; path.set_file_name(PathBuf::from(in_file_name)); let in_str = fs::read_to_string(&path).unwrap(); let dataset = DataSet::from_str(&in_str).unwrap(); let dataset_normalized = normalize(&dataset).unwrap(); let normalized = dataset_normalized.to_nquads().unwrap(); if &normalized == &expected_str { passed += 1; } else { let changes = difference::Changeset::new(&normalized, &expected_str, "\n"); eprintln!("test {}: failed. diff:\n{}", num, changes); } } assert!(total > 0); assert_eq!(passed, total); } }