#!/usr/bin/env python3
#author: Harshvardhan J. Pandit

'''Take CSV and generate RDF from it'''

########################################
# How to read and understand this file #
# 1. Start from the end of the file
# 2. This script reads CSV files explicitly declared
# 3. It generates RDF terms using rdflib for Classes and Properties
# 4. It writes those terms to a file - one file per module
# 5. It combines all written files into dpv.ttl and dpv-gdpr.ttl

# This script assumes the input is well structured and formatted
# If it isn't, errors may silently propagate

# CSV FILES are in IMPORT_CSV_PATH
# RDF FILES are written to EXPORT_DPV_MODULE_PATH
########################################

IMPORT_CSV_PATH = './vocab_csv'
EXPORT_DPV_PATH = './vocab_dpv'
EXPORT_DPV_MODULE_PATH = './vocab_dpv/modules'
EXPORT_DPV_GDPR_PATH = './vocab_dpv_gdpr'

# serializations in the form of extension: rdflib name
RDF_SERIALIZATIONS = {
    'rdf': 'xml',
    'ttl': 'turtle',
    'n3': 'n3',
    'jsonld': 'json-ld'
    }

import csv
from collections import namedtuple

from rdflib import Graph, Namespace
from rdflib.namespace import XSD
from rdflib import RDF, RDFS, OWL
from rdflib.term import Literal, URIRef, BNode

import logging
# logging configuration for debugging to console
logging.basicConfig(
    level=logging.DEBUG, format='%(levelname)s - %(funcName)s :: %(lineno)d - %(message)s')
DEBUG = logging.debug
INFO = logging.info

DCT = Namespace('http://purl.org/dc/terms/')
DPV = Namespace('http://www.w3.org/ns/dpv#')
DPV_GDPR = Namespace('http://www.w3.org/ns/dpv-gdpr#')
FOAF = Namespace('http://xmlns.com/foaf/0.1/')
ODRL = Namespace('http://www.w3.org/ns/odrl/2/')
PROV = Namespace('http://www.w3.org/ns/prov#')
SKOS = Namespace('http://www.w3.org/2004/02/skos/core#')
SPL = Namespace('http://www.specialprivacy.eu/langs/usage-policy#')
SVD = Namespace('http://www.specialprivacy.eu/vocabs/data#')
SVDU = Namespace('http://www.specialprivacy.eu/vocabs/duration#')
SVL = Namespace('http://www.specialprivacy.eu/vocabs/locations#')
SVPR = Namespace('http://www.specialprivacy.eu/vocabs/processing#')
SVPU = Namespace('http://www.specialprivacy.eu/vocabs/purposes#')
SVR = Namespace('http://www.specialprivacy.eu/vocabs/recipients')
SW = Namespace('http://www.w3.org/2003/06/sw-vocab-status/ns#')
TIME = Namespace('http://www.w3.org/2006/time#')

# The dpv namespace is the default base for all terms
# Later, this is changed to write terms under the DPV-GDPR namespace
BASE = DPV

NAMESPACES = {
    'dct': DCT,
    'dpv': DPV,
    'dpv-gdpr': DPV_GDPR,
    'foaf': FOAF,
    'odrl': ODRL,
    'owl': OWL,
    'prov': PROV,
    'rdf': RDF,
    'rdfs': RDFS,
    'skos': SKOS,
    'spl': SPL,
    'svd': SVD,
    'svdu': SVDU,
    'svl': SVL,
    'svpr': SVPR,
    'svpu': SVPU,
    'svr': SVR,
    'sw': SW,
    'time': TIME,
    'xsd': XSD,
}
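
# Note: prefixed values such as 'rdfs:Resource' are resolved through this dict using
# rdflib Namespaces, i.e. NAMESPACES['rdfs']['Resource'] gives the URIRef
# <http://www.w3.org/2000/01/rdf-schema#Resource>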

# the field names are based on the RDF terms they will be translated to

DPV_Class = namedtuple('DPV_Class', [
    'term', 'rdfs_label', 'dct_description', 'rdfs_subclassof',
    'rdfs_seealso', 'relation', 'rdfs_comment', 'rdfs_isdefinedby',
    'dct_created', 'sw_termstatus', 'dct_creator',
    'dct_dateaccepted', 'resolution'])

DPV_Property = namedtuple('DPV_Property', [
    'term', 'rdfs_label', 'dct_description',
    'rdfs_domain', 'rdfs_range', 'rdfs_subpropertyof',
    'rdfs_seealso', 'relation', 'rdfs_comment', 'rdfs_isdefinedby',
    'dct_created', 'sw_termstatus', 'dct_creator',
    'dct_dateaccepted', 'resolution'])
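
# For illustration only: each CSV row maps positionally onto the fields above,
# e.g. a (hypothetical) row "Marketing,Marketing,purposes associated with marketing,dpv:Purpose,..."
# would become DPV_Class(term='Marketing', rdfs_label='Marketing', ...)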


def extract_terms_from_csv(filepath, Class):
    '''extracts data from file.csv and creates instances of Class
    returns list of Class instances'''
    # this is a hack to get the number of fields to parse from the CSV
    # it relies on the internal data structure of a namedtuple
    attributes = Class.__dict__
    attributes = len(attributes['_fields'])
    with open(filepath) as fd:
        csvreader = csv.reader(fd)
        next(csvreader)
        terms = []
        for row in csvreader:
            # skip empty rows
            if not row[0].strip():
                continue
            # extract only the required number of fields, ignore anything after that
            row = [term.strip() for term in row[:attributes]]
            # create instance of required class
            terms.append(Class(*row))

    return terms
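
# Example usage (mirrors the module-processing loop at the end of this file):
#   classes = extract_terms_from_csv(f'{IMPORT_CSV_PATH}/Purpose.csv', DPV_Class)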


def add_common_triples_for_all_terms(term, graph):
    '''Adds triples for any term to graph
    Common triples are those shared by Class and Property
    term: data structure of the term; an object with attributes
    graph: rdflib graph
    returns: None'''
    # rdfs:label
    graph.add((BASE[f'{term.term}'], RDFS.label, Literal(term.rdfs_label, lang='en')))
    # dct:description
    graph.add((BASE[f'{term.term}'], DCT.description, Literal(term.dct_description, lang='en')))
    # rdfs:seeAlso
    # TODO: use relation field for relevant terms
    # currently every related term is recorded using rdfs:seeAlso
    # the next column contains the relation, parse and use that
    if term.rdfs_seealso:
        links = [l.strip() for l in term.rdfs_seealso.split(',')]
        for link in links:
            if link.startswith('http'):
                graph.add((BASE[f'{term.term}'], RDFS.seeAlso, URIRef(link)))
            elif ':' in link:
                # assuming something like rdfs:Resource
                prefix, label = link.split(':')
                # get the namespace from the registered ones and create the URI
                # will throw an error if the namespace is not registered
                # dpv internal terms are expected to have the prefix i.e. dpv:term
                link = NAMESPACES[prefix][f'{label}']
                graph.add((BASE[f'{term.term}'], RDFS.seeAlso, link))
            else:
                graph.add((BASE[f'{term.term}'], RDFS.seeAlso, Literal(link, datatype=XSD.string)))
    # rdfs:comment
    if term.rdfs_comment:
        graph.add((BASE[f'{term.term}'], RDFS.comment, Literal(term.rdfs_comment, lang='en')))
    # rdfs:isDefinedBy
    if term.rdfs_isdefinedby:
        links = [l.strip() for l in term.rdfs_isdefinedby.split(',')]
        for link in links:
            if link.startswith('http'):
                graph.add((BASE[f'{term.term}'], RDFS.isDefinedBy, URIRef(link)))
            else:
                graph.add((BASE[f'{term.term}'], RDFS.isDefinedBy, Literal(link, datatype=XSD.string)))
    # dct:created
    graph.add((BASE[f'{term.term}'], DCT.created, Literal(term.dct_created, datatype=XSD.date)))
    # sw:term_status
    graph.add((BASE[f'{term.term}'], SW.term_status, Literal(term.sw_termstatus, lang='en')))
    # dct:creator
    if term.dct_creator:
        authors = [a.strip() for a in term.dct_creator.split(',')]
        for author in authors:
            graph.add((BASE[f'{term.term}'], DCT.creator, Literal(author, datatype=XSD.string)))
    # dct:date-accepted
    if term.dct_dateaccepted:
        graph.add((BASE[f'{term.term}'], DCT['date-accepted'], Literal(term.dct_dateaccepted, datatype=XSD.date)))
    # resolution
    # do nothing

    return None
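
# Illustration only (hypothetical term 'Foo'): the triples added here amount to e.g.
#   dpv:Foo rdfs:label "Foo"@en ;
#       dct:description "..."@en ;
#       dct:created "..."^^xsd:date ;
#       sw:term_status "accepted"@en .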


def add_triples_for_classes(classes, graph):
    '''Adds triples for classes to graph
    classes: list of class terms (parsed CSV rows)
    graph: rdflib graph
    returns: None'''

    for cls in classes:
        # only add accepted classes
        # if cls.sw_termstatus != "accepted":
        #     continue
        # rdf:type
        graph.add((BASE[f'{cls.term}'], RDF.type, RDFS.Class))
        # rdfs:subClassOf
        if cls.rdfs_subclassof:
            parents = [p.strip() for p in cls.rdfs_subclassof.split(',')]
            for parent in parents:
                if parent.startswith('http'):
                    graph.add((BASE[f'{cls.term}'], RDFS.subClassOf, URIRef(parent)))
                elif ':' in parent:
                    # assuming something like rdfs:Resource
                    prefix, term = parent.split(':')
                    # get the namespace from the registered ones and create the URI
                    # will throw an error if the namespace is not registered
                    # dpv internal terms are expected to have the prefix i.e. dpv:term
                    parent = NAMESPACES[prefix][f'{term}']
                    graph.add((BASE[f'{cls.term}'], RDFS.subClassOf, parent))
                else:
                    graph.add((BASE[f'{cls.term}'], RDFS.subClassOf, Literal(parent, datatype=XSD.string)))

        add_common_triples_for_all_terms(cls, graph)

    return None
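
# Illustration only (hypothetical class 'Foo'): a row with rdfs_subclassof
# "dpv:Purpose, http://example.com/Thing" yields:
#   dpv:Foo a rdfs:Class ;
#       rdfs:subClassOf dpv:Purpose, <http://example.com/Thing> .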


def add_triples_for_properties(properties, graph):
    '''Adds triples for properties to graph
    properties: list of property terms (parsed CSV rows)
    graph: rdflib graph
    returns: None'''

    for prop in properties:
        # only add accepted properties
        if prop.sw_termstatus != "accepted":
            continue
        # rdf:type
        graph.add((BASE[f'{prop.term}'], RDF.type, RDF.Property))
        # rdfs:domain
        if prop.rdfs_domain:
            # assuming something like rdfs:Resource
            prefix, label = prop.rdfs_domain.split(':')
            # get the namespace from the registered ones and create the URI
            # will throw an error if the namespace is not registered
            # dpv internal terms are expected to have the prefix i.e. dpv:term
            link = NAMESPACES[prefix][f'{label}']
            graph.add((BASE[f'{prop.term}'], RDFS.domain, link))
        # rdfs:range
        if prop.rdfs_range:
            # assuming something like rdfs:Resource
            prefix, label = prop.rdfs_range.split(':')
            # get the namespace from the registered ones and create the URI
            # will throw an error if the namespace is not registered
            # dpv internal terms are expected to have the prefix i.e. dpv:term
            link = NAMESPACES[prefix][f'{label}']
            graph.add((BASE[f'{prop.term}'], RDFS.range, link))
        # rdfs:subPropertyOf
        if prop.rdfs_subpropertyof:
            parents = [p.strip() for p in prop.rdfs_subpropertyof.split(',')]
            for parent in parents:
                if parent.startswith('http'):
                    graph.add((BASE[f'{prop.term}'], RDFS.subPropertyOf, URIRef(parent)))
                elif ':' in parent:
                    # assuming something like rdfs:Resource
                    prefix, term = parent.split(':')
                    # get the namespace from the registered ones and create the URI
                    # will throw an error if the namespace is not registered
                    # dpv internal terms are expected to have the prefix i.e. dpv:term
                    parent = NAMESPACES[prefix][f'{term}']
                    graph.add((BASE[f'{prop.term}'], RDFS.subPropertyOf, parent))
                else:
                    graph.add((BASE[f'{prop.term}'], RDFS.subPropertyOf, Literal(parent, datatype=XSD.string)))
        add_common_triples_for_all_terms(prop, graph)
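
# Illustration only (hypothetical property 'hasFoo'): a row with rdfs_domain "dpv:PersonalDataHandling"
# and rdfs_range "dpv:Purpose" yields:
#   dpv:hasFoo a rdf:Property ;
#       rdfs:domain dpv:PersonalDataHandling ;
#       rdfs:range dpv:Purpose .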


def serialize_graph(graph, filepath):
    '''serializes given graph at filepath with defined formats'''
    for ext, format in RDF_SERIALIZATIONS.items():
        graph.serialize(f'{filepath}.{ext}', format=format)
        INFO(f'wrote {filepath}.{ext}')
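
# Example: serialize_graph(graph, f'{EXPORT_DPV_MODULE_PATH}/purposes') would write
# purposes.rdf, purposes.ttl, purposes.n3, and purposes.jsonld to that directory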


# #############################################################################

# DPV #

DPV_CSV_FILES = {
    'base': {
        'classes': f'{IMPORT_CSV_PATH}/BaseOntology.csv',
        'properties': f'{IMPORT_CSV_PATH}/BaseOntology_properties.csv',
    },
    'personal_data_categories': {
        'classes': f'{IMPORT_CSV_PATH}/PersonalDataCategory.csv',
    },
    'purposes': {
        'classes': f'{IMPORT_CSV_PATH}/Purpose.csv',
        'properties': f'{IMPORT_CSV_PATH}/Purpose_properties.csv',
    },
    'processing': {
        'classes': f'{IMPORT_CSV_PATH}/Processing.csv',
        'properties': f'{IMPORT_CSV_PATH}/Processing_properties.csv',
    },
    'technical_organisational_measures': {
        'classes': f'{IMPORT_CSV_PATH}/TechnicalOrganisationalMeasure.csv',
        'properties': f'{IMPORT_CSV_PATH}/TechnicalOrganisationalMeasure_properties.csv',
    },
    'entities': {
        'classes': f'{IMPORT_CSV_PATH}/Entities.csv',
    },
    'consent': {
        'classes': f'{IMPORT_CSV_PATH}/Consent.csv',
        'properties': f'{IMPORT_CSV_PATH}/Consent_properties.csv',
    },
    }
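
# Each module above is serialized to its own set of files under EXPORT_DPV_MODULE_PATH
# (e.g. ./vocab_dpv/modules/purposes.ttl) and then merged into the combined DPV graph below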

# this graph will get written to dpv.ttl
DPV_GRAPH = Graph()

for name, module in DPV_CSV_FILES.items():
    graph = Graph()
    for prefix, namespace in NAMESPACES.items():
        graph.namespace_manager.bind(prefix, namespace)
    if 'classes' in module:
        classes = extract_terms_from_csv(module['classes'], DPV_Class)
        DEBUG(f'there are {len(classes)} classes in {name}')
        add_triples_for_classes(classes, graph)
    if 'properties' in module:
        properties = extract_terms_from_csv(module['properties'], DPV_Property)
        DEBUG(f'there are {len(properties)} properties in {name}')
        add_triples_for_properties(properties, graph)
    serialize_graph(graph, f'{EXPORT_DPV_MODULE_PATH}/{name}')
    DPV_GRAPH += graph

# add information about the ontology
# this is assumed to be in the file dpv-ontology-metadata.ttl
graph = Graph()
graph.parse('dpv-ontology-metadata.ttl', format='turtle')
DPV_GRAPH += graph

for prefix, namespace in NAMESPACES.items():
    DPV_GRAPH.namespace_manager.bind(prefix, namespace)
serialize_graph(DPV_GRAPH, f'{EXPORT_DPV_PATH}/dpv')

# DPV-GDPR #
# dpv-gdpr is the exact same as dpv in terms of requirements and structure
# except that the namespace is different
# so instead of rewriting the entire code again for dpv-gdpr,
# here I become lazy and instead change the DPV namespace to DPV-GDPR

BASE = NAMESPACES['dpv-gdpr']

graph = Graph()
for prefix, namespace in NAMESPACES.items():
    graph.namespace_manager.bind(prefix, namespace)
classes = extract_terms_from_csv(f'{IMPORT_CSV_PATH}/LegalBasis.csv', DPV_Class)
add_triples_for_classes(classes, graph)
serialize_graph(graph, f'{EXPORT_DPV_GDPR_PATH}/dpv-gdpr')

# #############################################################################