root/tests/rdfa2rdfxml.py

Revision 8adf69fff952d069e4f6da472d9d8e0acf8fea28, 9.4 kB (checked in by Manu Sporny <msporny@…>, 22 months ago)

Fixed the last remaining Test Case bug that was causing TC#105 and
TC#106 to fail.

  • Property mode set to 100755
Line 
1#!/usr/bin/env python
2#
3# This file is part of librdfa.
4#
5# librdfa is Free Software, and can be licensed under any of the
6# following three licenses:
7#
8#   1. GNU Lesser General Public License (LGPL) V2.1 or any
9#      newer version
10#   2. GNU General Public License (GPL) V2 or any newer version
11#   3. Apache License, V2.0 or any newer version
12#
13# You may not use this file except in compliance with at least one of
14# the above three licenses.
15#
16# See LICENSE-* at the top of this software distribution for more
17# information regarding the details of each license.
18#
# Reads in an XHTML+RDFa file and outputs the triples generated by the file
# in RDF/XML format (the triples are first serialized to N3 internally).
21import sys, os, urllib2
22sys.path += ("../python/dist",)
23import rdfa
24from StringIO import StringIO
25from rdflib.Graph import ConjunctiveGraph
26
# Kinds of input location that main() distinguishes when deciding how to
# open the document named on the command line.
URL_TYPE_HTTP = 1
URL_TYPE_FILE = 2
29
##
# Triple callback registered with the parser: records each reported item in
# the rdf dictionary.
#
# Items whose objectType is rdfa.RDF_TYPE_NAMESPACE_PREFIX are stored in
# rdf['namespaces'], keyed by the predicate with the object as the value.
# Everything else is appended to rdf['triples'] as a 6-tuple of
# (subject, predicate, obj, objectType, dataType, language).
#
# @param rdf the rdf object to use when storing data.
# @param subject the subject of the triple.
# @param predicate the predicate for the triple.
# @param obj the object of the triple.
# @param objectType the type for the object in the triple.
# @param dataType the dataType for the object in the triple.
# @param language the language for the object in the triple.
def handleTriple(rdf, subject, predicate, obj, objectType, dataType,
                 language):
    if objectType != rdfa.RDF_TYPE_NAMESPACE_PREFIX:
        record = (subject, predicate, obj, objectType, dataType, language)
        rdf['triples'].append(record)
        return

    # Namespace declarations are kept apart from the ordinary triples.
    rdf['namespaces'][predicate] = obj
48       
##
# Called whenever the processing buffer for the C-side needs to be re-filled.
#
# At most bufferSize characters are read per call, so the returned chunk
# never exceeds the size the caller asked for. A chunk shorter than
# bufferSize (including an empty one) is only returned once the stream is
# exhausted.
#
# @param dataFile the file-like object to use when reading in the data stream.
# @param bufferSize the size of the buffer to return. Returning anything less
#                   than bufferSize will halt execution after the returned
#                   buffer has been processed.
#
# @return the next chunk of data read from dataFile.
def fillBuffer(dataFile, bufferSize):
    return dataFile.read(bufferSize)
58
##
# Converts the object of a triple to an N3 formatted term.
#
# Plain, typed and XML literals are written as a double-quoted string with
# backslash, double quote, newline, carriage return and tab escaped. A plain
# literal with a language gets an "@language" suffix; a typed literal gets
# "^^<dataType>"; an XML literal gets the rdf:XMLLiteral datatype. IRIs are
# written as "<iri>". Any other object type yields an empty string.
#
# @param obj the object of the triple.
# @param objectType the type for the object in the triple.
# @param dataType the dataType for the object in the triple.
# @param language the language for the object in the triple.
#
# @return an N3 formatted string.
def objectToN3(obj, objectType, dataType, language):
    rval = ""

    if(objectType in (rdfa.RDF_TYPE_PLAIN_LITERAL,
                      rdfa.RDF_TYPE_TYPED_LITERAL,
                      rdfa.RDF_TYPE_XML_LITERAL)):
        # The backslash must be escaped first; otherwise the backslashes
        # introduced by the later replacements would be escaped again.
        escaped = obj.replace("\\", "\\\\")
        escaped = escaped.replace("\"", "\\\"")
        escaped = escaped.replace("\n", "\\n")
        escaped = escaped.replace("\r", "\\r")
        escaped = escaped.replace("\t", "\\t")
        rval += "\"%s\"" % (escaped,)
    elif(objectType == rdfa.RDF_TYPE_IRI):
        rval += "<%s>" % (obj,)

    if(language and (objectType == rdfa.RDF_TYPE_PLAIN_LITERAL)):
        rval += "@%s" % (language,)

    if(objectType == rdfa.RDF_TYPE_TYPED_LITERAL):
        rval += "^^<%s>" % (dataType,)
    elif(objectType == rdfa.RDF_TYPE_XML_LITERAL):
        rval += "^^<http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral>"

    return rval
79
##
# Converts the triples of one bnode to an N3 "[ ... ]" property list.
#
# Each triple contributes "<predicate> object", separated by "; ". When the
# object is itself a bnode that has not been processed yet, it is expanded
# recursively from allTriples. After the list is built, the subject of the
# first triple is appended to processed.
#
# NOTE(review): nothing here guards against bnodes that reference each other
# in a cycle; such input recurses without terminating.
#
# @param triples the triples that share the bnode as their subject.
# @param processed all of the subjects that have already been processed;
#                  appended to by this function.
# @param allTriples the complete triple store, used to look up the triples
#                   of nested bnodes.
#
# @return an N3 formatted string.
def bnodeToN3(triples, processed, allTriples):
    fragments = []

    for triple in triples:
        subject, predicate, obj, objectType, dataType, language = triple[:6]

        if obj.startswith("_:") and obj not in processed:
            # Unprocessed bnode object: nest its own property list.
            nested = bnodeToN3(
                getTriplesBySubject(obj, allTriples), processed, allTriples)
            fragments.append("<%s> %s" % (predicate, nested))
        elif subject not in processed:
            term = objectToN3(obj, objectType, dataType, language)
            fragments.append("<%s> %s" % (predicate, term))
        else:
            # Nothing is written for this triple, but it still takes part
            # in the "; " separation.
            fragments.append("")

    if not triples:
        return "[ ]"

    processed.append(triples[0][0])
    return "[ " + "; ".join(fragments) + " ]"
124
##
# Converts a list of triples to N3 statements, one "<s> <p> o ." line each.
#
# Triples whose object is a bnode that is already in processed are skipped.
# Other bnode objects are expanded inline via bnodeToN3; all remaining
# objects are formatted with objectToN3.
#
# @param triples the triples to convert.
# @param processed all of the bnodes that have already been processed.
# @param allTriples the complete triple store, used to look up the triples
#                   of bnode objects.
#
# @return an N3 formatted string.
def tripleToN3(triples, processed, allTriples):
    statements = []

    for triple in triples:
        subject, predicate, obj, objectType, dataType, language = triple[:6]
        isBnode = obj.startswith("_:")

        if isBnode and obj in processed:
            continue

        if isBnode:
            term = bnodeToN3(
                getTriplesBySubject(obj, allTriples), processed, allTriples)
        else:
            term = objectToN3(obj, objectType, dataType, language)

        statements.append("<%s> <%s> %s .\n" % (subject, predicate, term))

    return "".join(statements)
162
##
# Gets the non-bnode subjects that are in the triple store.
#
# Each subject is reported once, in the order in which it first appears in
# the triple store, so that the generated output is reproducible.
#
# @param triples the triple store.
#
# @return a list of all of the non-bnode subjects in the triple store.
def getNonBnodeSubjects(triples):
    rval = []
    seen = set()

    for triple in triples:
        subject = triple[0]
        if(subject.startswith("_:") or subject in seen):
            continue
        seen.add(subject)
        rval.append(subject)

    return rval
178
##
# Gets the bnode subjects that are in the triple store.
#
# @param triples the triple store.
#
# @return a sorted list of all of the bnode subjects in the triple store,
#         each listed once.
def getBnodeSubjects(triples):
    return sorted(set(
        triple[0] for triple in triples if triple[0].startswith("_:")))
197
##
# Gets the triples by subject.
#
# @param subject The subject to use when retrieving the triples.
# @param triples The triple store to search; the first element of each
#                triple is compared against the subject.
#
# @return A list of all triples that match a given subject, in their
#         original order.
def getTriplesBySubject(subject, triples):
    return [triple for triple in triples if triple[0] == subject]
212
##
# Gets RDF/XML given an object with pre-defined namespaces and triples.
#
# The triples are first written out as an N3 document, which is then parsed
# into a ConjunctiveGraph and serialized. The 'xh1' and 'rdf' prefixes are
# added to rdf['namespaces'] as a side effect.
#
# @param rdf the rdf dictionary object that contains namespaces and triples.
#
# @return the RDF/XML text.
def getRdfXml(rdf):
    namespaces = rdf['namespaces']
    namespaces['xh1'] = "http://www.w3.org/1999/xhtml/vocab#"
    namespaces['rdf'] = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"

    # Prefix declarations come first in the N3 document
    chunks = ["@prefix %s: <%s> .\n" % (prefix, uri)
              for prefix, uri in namespaces.items()]

    triples = rdf['triples']
    processed = []

    # Statements for subjects identified by a URI
    for subject in getNonBnodeSubjects(triples):
        if subject not in processed:
            chunks.append(tripleToN3(
                getTriplesBySubject(subject, triples), processed, triples))
        processed.append(subject)

    # Bnodes that were not already written inline as part of another
    # statement
    for subject in getBnodeSubjects(triples):
        if subject in processed:
            continue
        chunks.append(bnodeToN3(
            getTriplesBySubject(subject, triples), processed, triples))
        chunks.append(" .\n")

    graph = ConjunctiveGraph()
    graph.parse(StringIO("".join(chunks)), format="n3")

    return graph.serialize()
260
##
# The main entry point for the script.
#
# Parses the document named by argv[1] (a local file, or an http:/https:
# URL) and writes the resulting RDF/XML to stdout. Usage and error messages
# are also written to stdout, after which the process exits with status 1.
#
# @param argv the argument list passed to the program.
# @param stdout the standard output stream assigned to the program.
# @param environ the execution environment for the program.
def main(argv, stdout, environ):
    # Any non-empty first argument is accepted; short file names are valid.
    if((len(argv) < 2) or (len(argv[1]) < 1)):
        stdout.write("usage: %s <file>\n" % (argv[0],))
        stdout.write("or\n")
        stdout.write("       %s <URL>\n" % (argv[0],))
        sys.exit(1)

    source = argv[1]
    if(source.startswith(("http:", "https:"))):
        urlType = URL_TYPE_HTTP
    else:
        urlType = URL_TYPE_FILE

    if(urlType == URL_TYPE_FILE):
        if(not os.path.exists(source)):
            stdout.write("File %s, does not exist\n" % (source,))
            sys.exit(1)
        if(not os.access(source, os.R_OK)):
            stdout.write("Cannot read file named %s\n" % (source,))
            sys.exit(1)

    # Open the proper file stream
    if(urlType == URL_TYPE_HTTP):
        dataFile = urllib2.urlopen(source)
        baseUri = source
    else:
        dataFile = open(source, "r")
        baseUri = "file://" + os.path.abspath(source)

    # Create the RDF dictionary that will be used by the triple handler
    # callback
    rdf = {}
    rdf['namespaces'] = {}
    rdf['triples'] = []

    # The stream is closed even if setting up or running the parser fails
    try:
        parser = rdfa.RdfaParser(baseUri)
        parser.setTripleHandler(handleTriple, rdf)
        parser.setBufferHandler(fillBuffer, dataFile)
        parser.parse()
    finally:
        dataFile.close()

    # Write the RDF/XML to the output stream
    stdout.write(getRdfXml(rdf))
    stdout.write("\n")
316
##
# Run the rdfa2rdfxml python application when executed as a script.
if __name__ == "__main__":
    main(sys.argv, sys.stdout, os.environ)
Note: See TracBrowser for help on using the browser.