Init commit

6b1d7e08 · Grzegorz Kostkowski · 6b1d7e08 · 6b1d7e08 · 6b1d7e08 · 6b1d7e08
Commit 6b1d7e08 authored 3 years ago by Grzegorz Kostkowski
--- a/README.md
+++ b/README.md
+# Overview
+CLI script for displaying information about the content of an RDF graph.
+It can be useful when defining a new conversion config for a new resource.
+
+# Usage
+```
+Usage: rdf-info [OPTIONS] GRAPH OUT
+
+  ClI script for displaying information about RDF graph content. It can be
+  useful when defining new conversion config for new resource.
+
+  Arguments:
+      GRAPH: Path to RDF graph to convert.
+      OUT: Path for output *.json file.
+
+Options:
+  -f, --graph-format TEXT         Format of the input graph file. Don't need
+                                  to pass unless converter is not able to
+                                  guess the format correctly. List of
+                                  recognized formats: https://rdflib.readthedo
+                                  cs.io/en/stable/plugin_parsers.html#id1
+  -s, --distinct-concepts         If enabled then when generate
+                                  `concepts_count` dict, where distinct URIs
+                                  in subject and object will be counted
+                                  (counters will reflect number of concepts).
+                                  Otherwise, it will generate
+                                  `subject_by_namespace_cnt` key where won't
+                                  distinguish URIs, so same URI will be
+                                  counted as many times as it is present in
+                                  triple on subject position (may be treated
+                                  as estimated number of concepts). Note:
+                                  Please don't by confused by the term
+                                  'concept', as it does not refers to
+                                  `skos:Concept` (has broader meaning).
+                                  [default: False]
+  -n, --known-namespaces-path TEXT
+                                  Path to JSON file with index of known RDF
+                                  namespaces to identify. Defaults to index
+                                  maintained on https://prefix.cc/ website.
+  --help                          Show this message and exit.
+```
+
+
+# Output
+The generated JSON file contains the following keys:
+
+1. `triples_cnt`: number of triples in graph,
+1. `predicates_cnt`: dict with predicates and their number of occurrences,
+1. `known_ns_predicates_cnt`: dict with number of detected predicates belonging
+   to one of the known namespaces (specified under `--known-namespaces-path` option),
+1. `subject_by_namespace_cnt`: number of subjects (URIs representing concepts)
+   from detected namespaces (schemas); note: results may be inaccurate, as
+   recognition of namespaces is done using a simple heuristic (the base URI ends
+   at the last `#` character or, if there is none, at the last `/` character);
+   also, this does not really distinguish URIs, so the same URI will be counted
+   as many times as it is present in a triple on the subject position,
+1. `concepts_count`: number of concepts (URIs on subject or object position; in
+   RDF terms: URIs set by `rdf:about`, `rdf:ID`, `rdf:resource`)
+   from detected namespaces (schemas); note: results may be inaccurate, as
+   recognition of namespaces is done using a simple heuristic (the base URI ends
+   at the last `#` character or, if there is none, at the last `/` character).
+   In contrast to `subject_by_namespace_cnt`, it distinguishes URIs. It is
+   generated instead of `subject_by_namespace_cnt` when the
+   `--distinct-concepts` flag is enabled,
+1. `object_type_by_predicate`: dict of predicates, with list of types of objects
+    associated with them; type of information includes type of RDF entity and
+    type of value (in case of literals),
+1. `text_literals_langs_cnt_by_predicate`: dict of dicts, storing the number of
+   occurrences of literals (only for text literals) in different languages;
+   includes information about the associated predicate,
+1. `triple_examples_by_predicate`: dict storing single example of triple for
+   every predicate in the graph.
--- a/get_latest_schemas_dump.sh
+++ b/get_latest_schemas_dump.sh
+#!/bin/bash
+
+wget https://prefix.cc/popular/all.file.json -O known_namespaces.json
--- a/known_namespaces.json
+++ b/known_namespaces.json
--- a/rdf-info
+++ b/rdf-info
+#!/usr/bin/env python3
+
+"""
+CLI script for displaying information about the content of an RDF graph.
+It can be useful when defining a new conversion config for a new resource.
+"""
+from collections import defaultdict, OrderedDict
+from typing import Optional, Union
+
+import click
+import json
+import rdflib
+
+
+def load_rdf_graph(
+    rdf_graph_spec: Union[str, rdflib.Graph], format: Optional[str] = None
+):
+    if isinstance(rdf_graph_spec, rdflib.Graph):
+        return rdf_graph_spec
+    graph = rdflib.Graph()
+    if not format:
+        format = rdflib.util.guess_format(rdf_graph_spec)
+    graph.parse(rdf_graph_spec, format=format)
+    return graph
+
+
+def _json_set_as_list(obj):
+    """
+    Converts sets to lists in order to serialize as JSON.
+    """
+    if isinstance(obj, set):
+        return list(obj)
+    return obj
+
+
+def _o_type_spec(o):
+    if isinstance(o, rdflib.URIRef):
+        return "URI"
+    elif isinstance(o, rdflib.BNode):
+        return "BNode"
+    elif isinstance(o, rdflib.Literal):
+        l_type = o.datatype if o.datatype else "string"
+        return f"Literal:{l_type}"
+    else:
+        raise TypeError(f"Unknown object type: {type(o)}")
+
+
+def _guess_base_uri(uri):
+    """
+    Tries to extract base URI. In case of failure, returns "Unknown".
+    Extracted base URI contains original separator character at the end.
+    """
+    idx = uri.rfind("#")
+    if idx == -1:
+        idx = uri.rfind("/")
+        if idx == -1:
+            return "Unknown"
+    ns = uri[: idx + 1]
+    if not ns:
+        return "Unknown"
+    return ns
+
+
+def _rdf_info(graph, distinct_concepts, known_namespaces=None):
+    t_cnt = 0
+    p_cnt_dict = defaultdict(int)
+    kp_cnt_dict = defaultdict(int)
+    text_p_lang_cnt_dict = defaultdict(lambda: defaultdict(int))
+    p_o_type_dict = defaultdict(set)
+    uri_by_namespace_cnt = defaultdict(int)
+    triple_by_p_ex_dict = {}
+    dist_uris = set()
+    for s, p, o in graph:
+        p_cnt_dict[str(p)] += 1
+        if p_cnt_dict[str(p)] == 1:
+            triple_by_p_ex_dict[str(p)] = (str(s), str(p), str(o))
+        if known_namespaces:
+            p_str = str(p)
+            bp = _guess_base_uri(p_str)
+            if bp in known_namespaces:
+                kp_cnt_dict[known_namespaces[bp]] += 1
+        o_type = _o_type_spec(o)
+        p_o_type_dict[str(p)].add(o_type)
+        if o_type == "Literal:string":
+            lang = o.language if o.language else "Unspecified"
+            text_p_lang_cnt_dict[str(p)][lang] += 1
+        if isinstance(s, rdflib.URIRef):
+            s_str = str(s)
+            if distinct_concepts:
+                if s_str not in dist_uris:
+                    uri_by_namespace_cnt[_guess_base_uri(s_str)] += 1
+                    dist_uris.add(s_str)
+                if o_type == "URI":
+                    o_str = str(o)
+                    if o_str not in dist_uris:
+                        uri_by_namespace_cnt[_guess_base_uri(o_str)] += 1
+                        dist_uris.add(o_str)
+            else:
+                uri_by_namespace_cnt[_guess_base_uri(s_str)] += 1
+        t_cnt += 1
+    p_cnt_dict = OrderedDict(
+        sorted(p_cnt_dict.items(), key=lambda t: t[1], reverse=True)
+    )
+    uri_by_namespace_cnt = OrderedDict(
+        sorted(uri_by_namespace_cnt.items(), key=lambda t: t[1], reverse=True)
+    )
+    ordered_text_p_lang_cnt_dict = {}
+    for pred, d in text_p_lang_cnt_dict.items():
+        ordered_text_p_lang_cnt_dict[pred] = OrderedDict(
+            sorted(d.items(), key=lambda t: t[1], reverse=True)
+        )
+    p_o_type_dict = OrderedDict(sorted(p_o_type_dict.items(), key=lambda t: t[0]))
+    triple_by_p_ex_dict = OrderedDict(
+        sorted(triple_by_p_ex_dict.items(), key=lambda t: t[0])
+    )
+    subj_by_ns_cnt_key = (
+        "concepts_count" if distinct_concepts else "subject_by_namespace_cnt"
+    )
+    info = {
+        "triples_cnt": t_cnt,
+        "predicates_cnt": p_cnt_dict,
+        subj_by_ns_cnt_key: uri_by_namespace_cnt,
+        "object_type_by_predicate": p_o_type_dict,
+        "text_literals_langs_cnt_by_predicate": ordered_text_p_lang_cnt_dict,
+        "triple_examples_by_predicate": triple_by_p_ex_dict,
+    }
+    if known_namespaces:
+        kp_cnt_dict = OrderedDict(
+            sorted(kp_cnt_dict.items(), key=lambda t: t[1], reverse=True)
+        )
+        info["known_ns_predicates_cnt"] = kp_cnt_dict
+    return info
+
+
+@click.command()
+@click.argument("graph")
+@click.argument("out")
+@click.option(
+    "--graph-format",
+    "-f",
+    help="""
+    Format of the input graph file. Don't need to pass unless converter
+    is not able to guess the format correctly. List of recognized formats:
+    https://rdflib.readthedocs.io/en/stable/plugin_parsers.html#id1
+    """,
+)
+@click.option(
+    "-s",
+    "--distinct-concepts",
+    default=False,
+    show_default=True,
+    is_flag=True,
+    help="""
+    If enabled then when generate `concepts_count` dict, where
+    distinct URIs in subject and object
+    will be counted (counters will reflect number of concepts). Otherwise, it
+    will generate `subject_by_namespace_cnt` key where
+    won't distinguish URIs, so same URI will be counted as many times as it is
+    present in triple on subject position (may be treated as estimated number of
+    concepts).
+    Note: Please don't by confused by the term 'concept', as it does not refers
+    to `skos:Concept` (has broader meaning).
+    """,
+)
+@click.option(
+    "-n",
+    "--known-namespaces-path",
+    default="known_namespaces.json",
+    help="""
+    Path to JSON file with index of known RDF namespaces to identify.
+    Defaults to index maintained on https://prefix.cc/ website.
+    """,
+)
+def cli(
+    graph,
+    out,
+    graph_format,
+    distinct_concepts,
+    known_namespaces_path,
+):
+    """
+    ClI script for displaying information about RDF graph content.
+    It can be useful when defining new conversion config for new resource.
+
+    \b
+    Arguments:
+        GRAPH: Path to RDF graph to convert.
+        OUT: Path for output *.json file.
+    """
+    graph = load_rdf_graph(graph, graph_format)
+    with open(known_namespaces_path, "r") as ifile:
+        known_namespaces = json.load(ifile)
+    known_namespaces = {
+        v: k for k, v in known_namespaces.items()
+    }  # we need reversed index
+    info = _rdf_info(graph, distinct_concepts, known_namespaces=known_namespaces)
+    with open(out, "w") as ofile:
+        json.dump(info, ofile, indent=4, default=_json_set_as_list)
+
+
+# Run the click command only when the file is executed as a script.
+if __name__ == "__main__":
+    cli()