-
Notifications
You must be signed in to change notification settings - Fork 5
/
rdf.lisp
209 lines (180 loc) · 8.9 KB
/
rdf.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
(in-package :conllu.rdf)
;; NOTES: 1) we use Turtle mostly due to the easy way of specifying
;; containers and blank nodes. 2) there is a lot of duplicated code
;; generated. For example, all dependency relations are defined as
;; they are found (with a xx a conll:DepRel . xx rdfs:label foo .)
;; While we could solve this by first collecting all the unique values
;; first and then emitting those values at the end we can simply use
;; the "delete duplicates" function of any decent triple-store to
;; handle this at import time, thus simplifying the code.
;; from Wilbur2
(defun components->string (components)
(with-output-to-string (stream)
(dolist (component components)
(princ component stream))))
;; from Wilbur2
(defun escape-string (string char-escape-function)
;; This tries to be clever about stuff that does not need to be escaped
(labels ((escape (s n i parts)
(let ((j (position-if char-escape-function s :start i)))
(cond (j (escape s n (1+ j)
(list* (funcall char-escape-function (char s j))
(subseq s i j)
parts)))
(parts (components->string (nreverse (cons (subseq s i) parts))))
(t s)))))
(escape string (length string) 0 nil)))
;; from Wilbur2
(defun escape-turtle-char (char)
(cdr (assoc char '((#\\ . "\\\\")
(#\" . "\\\"")
(#\Linefeed . "\\n")
(#\Return . "\\r")
(#\Tab . "\\t"))
:test #'char=)))
(defun unspecified-field? (c)
(string-equal c "_"))
(defun make-literal (string)
(format nil "\"~a\"" (escape-string string #'escape-turtle-char)))
(defun make-token-id (sentence-id token-id)
(format nil "~at~a" sentence-id token-id))
(defun make-upos (upos)
(format nil "conll:~a" (string-capitalize upos)))
(defun make-dep (dep)
(format nil "conll:d~a" dep))
(defun make-featurename (feat)
(format nil "conll:f~a" (string-downcase feat)))
(defun make-metadata (metadata)
(with-output-to-string (s)
(mapcar (lambda (m)
(cond
((stringp (car m))
(format s "conll:m~a a conll:MetadataKey .~%" (string-downcase (car m)))
(format s "conll:m~a rdfs:label ~a .~%"
(cl-ppcre:regex-replace-all
" "
(string-downcase (car m)) "_")
(make-literal (car m))))
((equal (cdr m) :none)
(format s "conll:m~a a conll:MetadataValue .~%"
(cl-ppcre:regex-replace-all
" "
(string-downcase (cdr m)) "_"))
(format s "conll:m~a rdfs:label ~a .~%"
(cl-ppcre:regex-replace-all
" "
(string-downcase (cdr m)) "_")
(make-literal (cdr m))))
(t
(error "Indetermined metadata case."))))
metadata)
s))
(defun make-metadata-bnode (metadata)
(format nil "~{ ~a~^;~}"
(mapcar (lambda (m)
(cond
((stringp (car m))
(format nil "conll:m~a ~a"
(cl-ppcre:regex-replace-all
" "
(string-downcase (car m))
"_")
(make-literal (cdr m))))
((eq (cdr m) :none)
(format nil "a conll:m~a"
(cl-ppcre:regex-replace-all
" "
(string-downcase (cdr m)) "_")))))
metadata)))
(defun make-features (features &optional (value-as-literal nil))
(unless (unspecified-field? features)
(with-output-to-string (s)
(mapc (lambda (f)
(destructuring-bind (name value) (split-sequence #\= f :count 2)
(format s "~a a conll:FeatureName . ~%" (make-featurename name))
(format s "~a rdfs:label ~a . ~%" (make-featurename name) (make-literal name))
(unless value-as-literal
(format s "conll:~a a conll:FeatureValue .~%" value)
(format s "conll:~a rdfs:label ~a .~%" value (make-literal value)))))
(split-sequence #\| features :remove-empty-subseqs t))
s)))
(defun make-features-bnode (features &optional (value-as-literal nil))
(labels ((fmt (name value value-as-literal)
(if value-as-literal
(format nil "~a ~a" (make-featurename name) (make-literal value))
(format nil "~a conll:~a" (make-featurename name) value))))
(unless (unspecified-field? features)
(format nil "~{ ~a~^;~}"
(mapcar (lambda (f)
(let ((alist (split-sequence #\= f :count 2)))
(if (> (length alist) 1)
(fmt (car alist) (cadr alist) value-as-literal)
(fmt "other" (car alist) value-as-literal))))
(split-sequence #\| features :remove-empty-subseqs t))))))
(defun convert-sentence-to-turtle (stream conll text id)
(format stream "conll:root a conll:DepRel .~%")
(format stream "conll:root rdfs:label ~a .~%" (make-literal "root"))
(format stream "conll:~a a conll:Sentence .~%" id)
(format stream "conll:~a rdfs:label ~a .~%" id (make-literal text))
(format stream "conll:~a conll:tokens (~{~a~^ ~}) .~%" id
(mapcar (lambda (tk) (format nil "conll:~a" (make-token-id id (slot-value tk 'id)))) (sentence-tokens conll)))
(format stream "~a~%" (make-metadata (sentence-meta conll)))
(format stream "conll:~a conll:metadata [ ~a ] .~%" id (make-metadata-bnode (sentence-meta conll)))
(dolist (tk (sentence-tokens conll))
(let ((tid (make-token-id id (slot-value tk 'id)))
(form (slot-value tk 'form))
(lemma (slot-value tk 'lemma))
(upostag (slot-value tk 'upostag))
(xpostag (slot-value tk 'xpostag))
(feats (make-features-bnode (slot-value tk 'feats) t))
(head (make-token-id id (slot-value tk 'head)))
(deprel (slot-value tk 'deprel))
(deps (slot-value tk 'deps))
(misc (make-features-bnode (slot-value tk 'misc) t)))
(format stream "conll:~a a conll:Token .~%" tid)
(format stream "conll:~a conll:sentence conll:~a .~%" tid id)
(format stream "conll:~a conll:form ~a .~%" tid (make-literal form))
(format stream "conll:~a conll:lemma ~a .~%" tid (make-literal lemma))
(format stream "conll:~a conll:upos ~a .~%" tid (make-upos upostag))
(format stream "~a a conll:UPostag .~%" (make-upos upostag))
(format stream "~a rdfs:label ~a .~%" (make-upos upostag) (make-literal upostag))
;; we append -raw to these as we don't have a good way of
;; converting them to "proper" RDF
(unless (unspecified-field? xpostag)
(format stream "conll:~a conll:xpos-raw ~a .~%" tid (make-literal xpostag)))
(unless (unspecified-field? deps)
(format stream "conll:~a conll:deps-raw ~a .~%" tid (make-literal deps)))
(when feats
;; (format stream "~a" (make-features (slot-value tk 'feats)))
(format stream "conll:~a conll:features [ ~a ] .~%" tid feats))
(when misc
;; (format stream "~a" (make-features (slot-value tk 'misc)))
(format stream "conll:~a conll:misc [ ~a ] .~%" tid misc)
;; (format stream "conll:~a conll:misc ~a .~%" tid (make-literal misc))
)
(if (string-equal "root" deprel)
(format stream "conll:~a conll:root conll:~a .~%" id tid)
(progn
(format stream "~a a conll:DepRel .~%" (make-dep deprel))
(format stream "~a rdfs:label ~a .~%" (make-dep deprel) (make-literal deprel))
(format stream "conll:~a ~a conll:~a .~%" tid (make-dep deprel) head)
(format stream "conll:~a ~a conll:~a .~%" tid (make-dep deprel) head))))))
(defun convert-rdf (corpusname stream conlls text-fn id-fn)
"Converts the collection of sentences (as generated by READ-CONLLU)
in CONLL, using the function TEXT-FN to extract the text of each
sentence and ID-FN to extract the id of each sentence (we need this
as there is no standardized way of knowing this.) Also the
generated Turtle file contains a lot of duplication so when you
import it into your triple-store, make sure you remove all
duplicate triples afterwards."
(format stream "@prefix conll: <http://br.ibm.com/conll/> . @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . @prefix dc: <http://purl.org/dc/elements/1.1/> . @prefix dcterms: <http://purl.org/dc/terms/> . @prefix skos: <http://www.w3.org/2004/02/skos/core#> . @prefix owl: <http://www.w3.org/2002/07/owl#> .~%")
(let ((corpus-id (uuid:make-v4-uuid)))
(format stream "conll:c~a a conll:Corpus .~%" corpus-id)
(format stream "conll:c~a rdfs:label \"~a\" .~%" corpus-id corpusname)
(format stream "conll:c~a conll:sentences (~{~a~^ ~}) . ~%" corpus-id (mapcar (lambda (c) (format nil "conll:~a" (funcall id-fn c))) conlls))
(dolist (c conlls)
(format stream "conll:~a conll:corpus conll:c~a.~%" (funcall id-fn c) corpus-id)
(convert-sentence-to-turtle stream c (funcall text-fn c) (funcall id-fn c)))))
(defun convert-rdf-file (file-in file-out)
(with-open-file (stream file-out :direction :output :if-exists :supersede)
(convert-rdf (pathname-name file-in) stream (read-conllu file-in) #'sentence-text #'sentence-id)))