Commit 6b1d7e08 authored by Grzegorz Kostkowski's avatar Grzegorz Kostkowski

Init commit

parents
# Overview
CLI script for displaying information about the content of an RDF graph.
It can be useful when defining a new conversion config for a new resource.
# Usage
```
Usage: rdf-info [OPTIONS] GRAPH OUT
ClI script for displaying information about RDF graph content. It can be
useful when defining new conversion config for new resource.
Arguments:
GRAPH: Path to RDF graph to convert.
OUT: Path for output *.json file.
Options:
-f, --graph-format TEXT Format of the input graph file. Don't need
to pass unless converter is not able to
guess the format correctly. List of
recognized formats: https://rdflib.readthedo
cs.io/en/stable/plugin_parsers.html#id1
-s, --distinct-concepts If enabled then when generate
`concepts_count` dict, where distinct URIs
in subject and object will be counted
(counters will reflect number of concepts).
Otherwise, it will generate
`subject_by_namespace_cnt` key where won't
distinguish URIs, so same URI will be
counted as many times as it is present in
triple on subject position (may be treated
as estimated number of concepts). Note:
Please don't by confused by the term
'concept', as it does not refers to
`skos:Concept` (has broader meaning).
[default: False]
-n, --known-namespaces-path TEXT
Path to JSON file with index of known RDF
namespaces to identify. Defaults to index
maintained on https://prefix.cc/ website.
--help Show this message and exit.
```
# Output
The generated JSON file contains the following keys:
1. `triples_cnt`: number of triples in the graph,
1. `predicates_cnt`: dict with predicates and their number of occurrences,
1. `known_ns_predicates_cnt`: dict with number of detected predicates belonging
to one of the known namespaces (specified under `--known-namespaces-path` option),
1. `subject_by_namespace_cnt`: number of subjects (URIs representing concepts)
from detected namespaces (schemas); note: results may be inaccurate, as
recognition of namespaces is done using a simple heuristic (base URI ends at
the last `#` or `/` character); also, this doesn't really distinguish URIs, so
the same URI will be counted as many times as it is present in a triple on the
subject position. Generated only if the `--distinct-concepts` flag is not enabled,
1. `concepts_count`: number of concepts (URIs on subject or object position; in
RDF terms: URIs set by `rdf:about`, `rdf:ID`, `rdf:resource`)
from detected namespaces (schemas); note: results may be inaccurate, as
recognition of namespaces is done using a simple heuristic (base URI ends at
the last `#` or `/` character); in contrast to `subject_by_namespace_cnt`, it
distinguishes URIs. Generated only if the `--distinct-concepts` flag is enabled,
1. `object_type_by_predicate`: dict of predicates, with list of types of objects
associated with them; type of information includes type of RDF entity and
type of value (in case of literals),
1. `text_literals_langs_cnt_by_predicate`: dict of dicts, storing the number of
occurrences of literals (only for text literals) in different languages;
includes information about the associated predicate,
1. `triple_examples_by_predicate`: dict storing single example of triple for
every predicate in the graph.
#!/bin/bash
# Downloads the index of popular RDF namespaces maintained on prefix.cc and
# stores it as known_namespaces.json in the current working directory.
wget --output-document=known_namespaces.json https://prefix.cc/popular/all.file.json
This diff is collapsed.
#!/usr/bin/env python3
"""
CLI script for displaying information about the content of an RDF graph.
It can be useful when defining a new conversion config for a new resource.
"""
from collections import defaultdict, OrderedDict
from typing import Optional, Union
import click
import json
import rdflib
def load_rdf_graph(
    rdf_graph_spec: Union[str, rdflib.Graph], format: Optional[str] = None
):
    """
    Returns an RDF graph for the given specification.

    An already loaded `rdflib.Graph` is returned untouched. Anything else is
    passed to `rdflib.Graph.parse` as the source to read. When `format` is not
    given, it is guessed with `rdflib.util.guess_format`.
    """
    if not isinstance(rdf_graph_spec, rdflib.Graph):
        parse_format = format or rdflib.util.guess_format(rdf_graph_spec)
        loaded = rdflib.Graph()
        loaded.parse(rdf_graph_spec, format=parse_format)
        return loaded
    return rdf_graph_spec
def _json_set_as_list(obj):
"""
Converts sets to lists in order to serialize as JSON.
"""
if isinstance(obj, set):
return list(obj)
return obj
def _o_type_spec(o) -> str:
    """
    Describes the kind of RDF term found on the object position.

    Returns "URI" for `rdflib.URIRef`, "BNode" for `rdflib.BNode` and
    "Literal:<datatype>" for `rdflib.Literal`, where "string" is used when
    the literal has no datatype set. Raises `TypeError` for anything else.
    """
    for term_cls, label in ((rdflib.URIRef, "URI"), (rdflib.BNode, "BNode")):
        if isinstance(o, term_cls):
            return label
    if isinstance(o, rdflib.Literal):
        literal_type = o.datatype or "string"
        return f"Literal:{literal_type}"
    raise TypeError(f"Unknown object type: {type(o)}")
def _guess_base_uri(uri):
"""
Tries to extract base URI. In case of failure, returns "Unknown".
Extracted base URI contains original separator character at the end.
"""
idx = uri.rfind("#")
if idx == -1:
idx = uri.rfind("/")
if idx == -1:
return "Unknown"
ns = uri[: idx + 1]
if not ns:
return "Unknown"
return ns
def _rdf_info(graph, distinct_concepts, known_namespaces=None):
    """
    Collects statistics about the content of an RDF graph.

    Args:
        graph: iterable of (subject, predicate, object) triples, e.g.
            `rdflib.Graph`.
        distinct_concepts: if enabled, every distinct URI found on subject
            or object position is counted once (per guessed namespace) and
            the result is stored under `concepts_count`. Otherwise every
            occurrence of a URI on subject position is counted and the
            result is stored under `subject_by_namespace_cnt`.
        known_namespaces: optional dict mapping namespace URI to its label.
            When given (and not empty), `known_ns_predicates_cnt` key is
            generated.

    Returns:
        Dict with gathered statistics. Values of `object_type_by_predicate`
        are sets, so they need to be converted before dumping as JSON.

    Raises:
        TypeError: propagated from `_o_type_spec` for unsupported object
            terms.
    """

    def _by_count_desc(counts):
        # Most frequent first; sorting is stable, so ties keep first-seen order.
        return OrderedDict(
            sorted(counts.items(), key=lambda t: t[1], reverse=True)
        )

    def _by_key(mapping):
        return OrderedDict(sorted(mapping.items(), key=lambda t: t[0]))

    t_cnt = 0
    p_cnt_dict = defaultdict(int)
    kp_cnt_dict = defaultdict(int)
    text_p_lang_cnt_dict = defaultdict(lambda: defaultdict(int))
    p_o_type_dict = defaultdict(set)
    uri_by_namespace_cnt = defaultdict(int)
    triple_by_p_ex_dict = {}
    dist_uris = set()

    for s, p, o in graph:
        t_cnt += 1
        p_str = str(p)
        p_cnt_dict[p_str] += 1
        # Keep the first triple seen for every predicate as its example.
        if p_str not in triple_by_p_ex_dict:
            triple_by_p_ex_dict[p_str] = (str(s), p_str, str(o))

        if known_namespaces:
            bp = _guess_base_uri(p_str)
            if bp in known_namespaces:
                kp_cnt_dict[known_namespaces[bp]] += 1

        o_type = _o_type_spec(o)
        p_o_type_dict[p_str].add(o_type)
        if o_type == "Literal:string":
            lang = o.language if o.language else "Unspecified"
            text_p_lang_cnt_dict[p_str][lang] += 1

        if distinct_concepts:
            # Object URIs are considered regardless of the kind of subject,
            # so URIs referenced only from blank-node subjects are counted
            # as well ("URIs on subject or object position").
            candidates = []
            if isinstance(s, rdflib.URIRef):
                candidates.append(str(s))
            if o_type == "URI":
                candidates.append(str(o))
            for uri in candidates:
                if uri not in dist_uris:
                    dist_uris.add(uri)
                    uri_by_namespace_cnt[_guess_base_uri(uri)] += 1
        elif isinstance(s, rdflib.URIRef):
            uri_by_namespace_cnt[_guess_base_uri(str(s))] += 1

    ordered_text_p_lang_cnt_dict = {
        pred: _by_count_desc(lang_counts)
        for pred, lang_counts in text_p_lang_cnt_dict.items()
    }
    subj_by_ns_cnt_key = (
        "concepts_count" if distinct_concepts else "subject_by_namespace_cnt"
    )
    info = {
        "triples_cnt": t_cnt,
        "predicates_cnt": _by_count_desc(p_cnt_dict),
        subj_by_ns_cnt_key: _by_count_desc(uri_by_namespace_cnt),
        "object_type_by_predicate": _by_key(p_o_type_dict),
        "text_literals_langs_cnt_by_predicate": ordered_text_p_lang_cnt_dict,
        "triple_examples_by_predicate": _by_key(triple_by_p_ex_dict),
    }
    if known_namespaces:
        info["known_ns_predicates_cnt"] = _by_count_desc(kp_cnt_dict)
    return info
@click.command()
@click.argument("graph")
@click.argument("out")
@click.option(
    "--graph-format",
    "-f",
    help="""
    Format of the input graph file. Don't need to pass unless converter
    is not able to guess the format correctly. List of recognized formats:
    https://rdflib.readthedocs.io/en/stable/plugin_parsers.html#id1
    """,
)
@click.option(
    "-s",
    "--distinct-concepts",
    default=False,
    show_default=True,
    is_flag=True,
    help="""
    If enabled then when generate `concepts_count` dict, where
    distinct URIs in subject and object
    will be counted (counters will reflect number of concepts). Otherwise, it
    will generate `subject_by_namespace_cnt` key where
    won't distinguish URIs, so same URI will be counted as many times as it is
    present in triple on subject position (may be treated as estimated number of
    concepts).
    Note: Please don't by confused by the term 'concept', as it does not refers
    to `skos:Concept` (has broader meaning).
    """,
)
@click.option(
    "-n",
    "--known-namespaces-path",
    default="known_namespaces.json",
    help="""
    Path to JSON file with index of known RDF namespaces to identify.
    Defaults to index maintained on https://prefix.cc/ website.
    """,
)
def cli(
    graph,
    out,
    graph_format,
    distinct_concepts,
    known_namespaces_path,
):
    """
    ClI script for displaying information about RDF graph content.
    It can be useful when defining new conversion config for new resource.

    \b
    Arguments:
    GRAPH: Path to RDF graph to convert.
    OUT: Path for output *.json file.
    """
    # NOTE: the docstring above is the text click shows for --help; the line
    # holding only the backspace marker has to start its own paragraph for
    # click to leave the "Arguments" paragraph unwrapped.
    rdf_graph = load_rdf_graph(graph, graph_format)

    # JSON files are read and written as UTF-8 explicitly, so the result does
    # not depend on the locale encoding of the machine.
    with open(known_namespaces_path, "r", encoding="utf-8") as ifile:
        prefix_to_namespace = json.load(ifile)
    # The index file maps prefix -> namespace URI, while lookups are done by
    # namespace URI, so the reversed index is needed.
    known_namespaces = {
        namespace: prefix for prefix, namespace in prefix_to_namespace.items()
    }

    info = _rdf_info(
        rdf_graph, distinct_concepts, known_namespaces=known_namespaces
    )
    with open(out, "w", encoding="utf-8") as ofile:
        json.dump(info, ofile, indent=4, default=_json_set_as_list)


if __name__ == "__main__":
    cli()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment