Fetch list of properties for wiki items

master
Raymon Zutekouw 2 days ago
parent 0023597690
commit cd8d5f14d1
Signed by: raymon
GPG Key ID: 0E62222846283925
  1. 199
      Cargo.lock
  2. 3
      Cargo.toml
  3. 44
      src/lib.rs

199
Cargo.lock generated

@ -82,6 +82,12 @@ dependencies = [
"syn",
]
[[package]]
name = "atomic-waker"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
[[package]]
name = "autocfg"
version = "1.5.0"
@ -170,6 +176,7 @@ dependencies = [
"iana-time-zone",
"js-sys",
"num-traits",
"serde",
"wasm-bindgen",
"windows-link",
]
@ -341,6 +348,12 @@ dependencies = [
"litrs",
]
[[package]]
name = "either"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
[[package]]
name = "encoding_rs"
version = "0.8.35"
@ -530,6 +543,25 @@ version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
[[package]]
name = "h2"
version = "0.4.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17da50a276f1e01e0ba6c029e47b7100754904ee8a278f886546e98575380785"
dependencies = [
"atomic-waker",
"bytes",
"fnv",
"futures-core",
"futures-sink",
"http",
"indexmap",
"slab",
"tokio",
"tokio-util",
"tracing",
]
[[package]]
name = "hashbrown"
version = "0.14.5"
@ -613,6 +645,7 @@ dependencies = [
"bytes",
"futures-channel",
"futures-util",
"h2",
"http",
"http-body",
"httparse",
@ -623,6 +656,22 @@ dependencies = [
"want",
]
[[package]]
name = "hyper-rustls"
version = "0.27.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
dependencies = [
"http",
"hyper",
"hyper-util",
"rustls",
"rustls-pki-types",
"tokio",
"tokio-rustls",
"tower-service",
]
[[package]]
name = "hyper-tls"
version = "0.6.0"
@ -658,9 +707,11 @@ dependencies = [
"percent-encoding",
"pin-project-lite",
"socket2",
"system-configuration",
"tokio",
"tower-service",
"tracing",
"windows-registry",
]
[[package]]
@ -831,6 +882,15 @@ dependencies = [
"serde",
]
[[package]]
name = "itertools"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "1.0.15"
@ -858,6 +918,12 @@ dependencies = [
"serde",
]
[[package]]
name = "lazy_static"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "libc"
version = "0.2.174"
@ -926,6 +992,12 @@ version = "2.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0"
[[package]]
name = "mime"
version = "0.3.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
[[package]]
name = "minimal-lexical"
version = "0.2.1"
@ -1311,17 +1383,21 @@ dependencies = [
"bytes",
"cookie",
"cookie_store",
"encoding_rs",
"futures-channel",
"futures-core",
"futures-util",
"h2",
"http",
"http-body",
"http-body-util",
"hyper",
"hyper-rustls",
"hyper-tls",
"hyper-util",
"js-sys",
"log",
"mime",
"native-tls",
"percent-encoding",
"pin-project-lite",
@ -1341,6 +1417,20 @@ dependencies = [
"web-sys",
]
[[package]]
name = "ring"
version = "0.17.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
dependencies = [
"cc",
"cfg-if",
"getrandom 0.2.16",
"libc",
"untrusted",
"windows-sys 0.52.0",
]
[[package]]
name = "ron"
version = "0.8.1"
@ -1382,6 +1472,19 @@ dependencies = [
"windows-sys 0.60.2",
]
[[package]]
name = "rustls"
version = "0.23.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0ebcbd2f03de0fc1122ad9bb24b127a5a6cd51d72604a3f3c50ac459762b6cc"
dependencies = [
"once_cell",
"rustls-pki-types",
"rustls-webpki",
"subtle",
"zeroize",
]
[[package]]
name = "rustls-pki-types"
version = "1.12.0"
@ -1391,6 +1494,17 @@ dependencies = [
"zeroize",
]
[[package]]
name = "rustls-webpki"
version = "0.103.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a17884ae0c1b773f1ccd2bd4a8c72f16da897310a98b0e84bf349ad5ead92fc"
dependencies = [
"ring",
"rustls-pki-types",
"untrusted",
]
[[package]]
name = "rustversion"
version = "1.0.21"
@ -1596,6 +1710,27 @@ dependencies = [
"syn",
]
[[package]]
name = "system-configuration"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b"
dependencies = [
"bitflags",
"core-foundation",
"system-configuration-sys",
]
[[package]]
name = "system-configuration-sys"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]]
name = "tempfile"
version = "3.20.0"
@ -1720,6 +1855,29 @@ dependencies = [
"tokio",
]
[[package]]
name = "tokio-rustls"
version = "0.26.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b"
dependencies = [
"rustls",
"tokio",
]
[[package]]
name = "tokio-util"
version = "0.7.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5"
dependencies = [
"bytes",
"futures-core",
"futures-sink",
"pin-project-lite",
"tokio",
]
[[package]]
name = "toml"
version = "0.8.23"
@ -1861,6 +2019,12 @@ version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
[[package]]
name = "untrusted"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
[[package]]
name = "url"
version = "2.5.4"
@ -2001,15 +2165,30 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "wikidata"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a34b80c94c1d2e5df9e3e7dcb886e646b2cf148773f689da670d8f11849d8c72"
dependencies = [
"chrono",
"lazy_static",
"serde",
"serde_json",
]
[[package]]
name = "wikipedia-infobox-analyzer"
version = "0.1.0"
dependencies = [
"ascii_table",
"itertools",
"mediawiki",
"regex",
"reqwest",
"serde_json",
"tokio",
"wikidata",
]
[[package]]
@ -2053,6 +2232,17 @@ version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a"
[[package]]
name = "windows-registry"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b8a9ed28765efc97bbc954883f4e6796c33a06546ebafacbabee9696967499e"
dependencies = [
"windows-link",
"windows-result",
"windows-strings",
]
[[package]]
name = "windows-result"
version = "0.3.4"
@ -2071,6 +2261,15 @@ dependencies = [
"windows-link",
]
[[package]]
name = "windows-sys"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "windows-sys"
version = "0.59.0"

@ -17,6 +17,9 @@ categories = ["command-line-utilities"]
[dependencies]
regex = "1.11.1"
ascii_table = "4.0.7"
itertools = "0.14.0"
mediawiki = "0.3.1"
reqwest = { version = "0.12", features = ["blocking"] }
serde_json = "1.0.64"
tokio = "1.47.1"
wikidata = "1.1.0"

@ -2,6 +2,7 @@
pub mod wikipedia_infobox_analyzer {
use ascii_table::{Align, AsciiTable};
use regex::Regex;
use itertools::Itertools;
/// Looks for a used template that lists wikidata properties
pub fn extract_used_properties_from_template(template: String) -> Vec<String> {
@ -64,6 +65,33 @@ pub mod wikipedia_infobox_analyzer {
return label[1].to_string();
}
/// Fetches the properties which have claims for the given wiki item inside of wikidata
pub fn fetch_properties_for_wiki_item(qid: u64) -> Vec<u64> {
let mut ids: Vec<u64> = vec![];
let uri = format!("https://www.wikidata.org/wiki/Special:EntityData/Q{qid}.json");
let res = reqwest::blocking::get(uri).unwrap();
let text = res.text().unwrap();
if text.contains("<h1>Not Found</h1><p>No entity with ID ") {
return vec![];
}
let ent = wikidata::Entity::from_json(serde_json::from_str(&text).unwrap()).unwrap();
if ent.claims.is_empty() {
return vec![];
}
for i in 0..ent.claims.len() {
let (wikidata::Pid(pid), _) = &ent.claims[i];
ids.push(*pid);
}
ids.sort();
ids.iter()
.map(|n| n.to_owned())
.unique()
.collect::<Vec<u64>>()
}
}
#[cfg(test)]
@ -121,4 +149,20 @@ mod tests {
let name = fetch_name_for_wiki_property(31).await;
assert_eq!(name, "instance of")
}
/// Checks the fetched property list of a known item against properties that are
/// expected to be present and properties that are expected to be absent.
#[test]
fn test_fetch_properties_for_wiki_item() {
    // Q16639197 is the identifier for GitLab on wikidata.
    let gitlab_qid: u64 = 16639197;
    let found = fetch_properties_for_wiki_item(gitlab_qid);
    let has = |pid: u64| found.contains(&pid);

    // Expected to be present on the item:
    assert!(has(10)); // P10: video
    assert!(has(18)); // P18: image
    assert!(has(154)); // P154: logo image
    assert!(has(178)); // P178: developer

    // Expected to be absent from the item:
    assert!(!has(19)); // P19: place of birth
    assert!(!has(30)); // P30: continent
}
}

Loading…
Cancel
Save