// Java tutorial
/**
 * Copyright 2013 DuraSpace, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.fcrepo.triplegenerators.tei;

import static com.google.common.collect.Lists.newArrayList;
import static com.hp.hpl.jena.graph.NodeFactory.createLiteral;
import static com.hp.hpl.jena.graph.NodeFactory.createURI;
import static com.hp.hpl.jena.rdf.model.ModelFactory.createDefaultModel;
import static com.hp.hpl.jena.rdf.model.ModelFactory.createModelForGraph;
import static java.lang.String.format;
import static java.nio.charset.StandardCharsets.UTF_8;
import static javax.xml.transform.TransformerFactory.newInstance;
import static org.modeshape.jcr.api.JcrConstants.JCR_CONTENT;
import static org.modeshape.jcr.api.JcrConstants.JCR_DATA;
import static org.slf4j.LoggerFactory.getLogger;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;

import javax.jcr.RepositoryException;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Templates;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.IssueReport.Issue;
import org.apache.any23.source.ByteArrayDocumentSource;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.any23.Any23;
import org.apache.any23.ExtractionReport;
import org.fcrepo.rdf.GraphProperties;
import org.fcrepo.rdf.GraphSubjects;
import org.fcrepo.triplegenerators.tei.xslt.LoggingErrorListener;
import org.slf4j.Logger;

import com.google.common.io.FileBackedOutputStream;
import com.hp.hpl.jena.graph.Graph;
import com.hp.hpl.jena.graph.Node;
import com.hp.hpl.jena.graph.Triple;
import com.hp.hpl.jena.mem.GraphMem;
import com.hp.hpl.jena.query.Dataset;
import com.hp.hpl.jena.sparql.core.DatasetImpl;

/**
 * Produces RDF for a repository node that holds TEI XML. The TEI is run
 * through two XSLT stylesheets ({@code /xslt/add-ids.xslt}, then
 * {@code /xslt/tei2rdf.xslt}) to obtain RDF/XML, which is then handed to
 * Any23 for triple extraction. Failures are not propagated from
 * {@link #getProperties(javax.jcr.Node, GraphSubjects)}; they are reported
 * as triples using the predicate
 * {@code info:fedora/hasProblemWithTeiRdfExtraction}.
 *
 * @author ajs6f
 * @date Jul 10, 2013
 */
public class TeiTripleGenerator implements GraphProperties {

    private static final String MODEL_NAME = "tei";

    /** Bytes held in memory before the intermediate XML spills to a temp file. */
    private static final int BUFFER_THRESHOLD = 1024 * 1024;

    private static final Any23 any23 = new Any23();

    private static final Logger LOGGER = getLogger(TeiTripleGenerator.class);

    private static final Node PROBLEM_PREDICATE =
            createURI("info:fedora/hasProblemWithTeiRdfExtraction");

    // Compiled stylesheets. Templates may be shared between threads, whereas a
    // Transformer may not, so a new Transformer is created for every
    // transformation instead of keeping one in a (static) field.
    private final Templates addIdsTemplates;

    private final Templates tei2RdfTemplates;

    /**
     * Compiles the two XSLT stylesheets with Saxon.
     *
     * @throws TransformerConfigurationException if a stylesheet cannot be
     *         compiled
     * @throws TransformerFactoryConfigurationError if the Saxon factory
     *         cannot be instantiated
     * @throws IOException if a stylesheet is missing from the classpath or
     *         its stream cannot be closed
     */
    public TeiTripleGenerator() throws TransformerConfigurationException,
            TransformerFactoryConfigurationError, IOException {
        // initialize XSLT
        final TransformerFactory tf =
                newInstance("net.sf.saxon.TransformerFactoryImpl", null);
        tf.setErrorListener(new LoggingErrorListener());
        addIdsTemplates = compileStylesheet(tf, "/xslt/add-ids.xslt");
        tei2RdfTemplates = compileStylesheet(tf, "/xslt/tei2rdf.xslt");
    }

    /**
     * Loads a stylesheet from the classpath and compiles it.
     *
     * @param tf the factory used to compile the stylesheet
     * @param path absolute classpath location of the stylesheet
     * @return the compiled stylesheet
     * @throws TransformerConfigurationException if compilation fails
     * @throws IOException if the resource does not exist or cannot be closed
     */
    private Templates compileStylesheet(final TransformerFactory tf,
            final String path) throws TransformerConfigurationException,
            IOException {
        try (final InputStream sourceStream =
                this.getClass().getResourceAsStream(path)) {
            if (sourceStream == null) {
                // Fail with a clear message instead of handing a null stream
                // to the XSLT compiler.
                throw new IOException(
                        "XSLT stylesheet not found on classpath: " + path);
            }
            return tf.newTemplates(new StreamSource(sourceStream));
        }
    }

    /**
     * Runs Any23 over RDF/XML and collects the extracted triples.
     *
     * @param rdfXml the RDF/XML document to extract from
     * @param baseUri URI passed to Any23 as the document URI; also used as
     *        the subject of any problem triples
     * @return A {@link Dataset} with extracted triples in the model named by
     *         {@link #getPropertyModelName()} and, if Any23 reported issues,
     *         one triple per issue in the {@code PROBLEMS_MODEL_NAME} model.
     * @throws TripleHandlerException
     * @throws IOException
     * @throws ExtractionException
     */
    protected Dataset extractTriples(final byte[] rdfXml, final String baseUri)
            throws TripleHandlerException, IOException, ExtractionException {
        final DocumentSource source =
                new ByteArrayDocumentSource(rdfXml, baseUri,
                        "application/rdf+xml");
        final Graph problems = new GraphMem();
        try (final ModelTripleHandler handler = new ModelTripleHandler()) {
            final ExtractionReport report = any23.extract(source, handler);
            final Dataset results = new DatasetImpl(createDefaultModel());
            results.addNamedModel(MODEL_NAME, handler.getModel());
            final Node subject = createURI(baseUri);
            for (final Extractor<?> extractor : report.getMatchingExtractors()) {
                final String extractorName =
                        extractor.getDescription().getExtractorName();
                for (final Issue issue : report.getExtractorIssues(extractorName)) {
                    // String.format needs %-style conversions; "{}" is SLF4J
                    // syntax and would be emitted verbatim with the arguments
                    // silently dropped.
                    final String mesg =
                            format("Extraction issue: (%s,%s): %s\n",
                                    issue.getCol(), issue.getRow(),
                                    issue.getMessage());
                    problems.add(new Triple(subject, PROBLEM_PREDICATE,
                            createLiteral(mesg)));
                }
            }
            if (problems.size() > 0) {
                results.addNamedModel(PROBLEMS_MODEL_NAME,
                        createModelForGraph(problems));
            }
            return results;
        }
    }

    /**
     * Transforms TEI XML into RDF/XML by applying the add-ids stylesheet and
     * then the tei2rdf stylesheet.
     *
     * @param resource An {@link InputStream} with TEI XML.
     * @return A {@code byte[]} of RDF/XML, encoded as UTF-8.
     * @throws IOException
     * @throws TransformerException
     */
    private byte[] createRDFXML(final InputStream resource)
            throws IOException, TransformerException {
        final Source resourceSource = new StreamSource(resource);
        final FileBackedOutputStream addIdsResultStream =
                new FileBackedOutputStream(BUFFER_THRESHOLD);
        try {
            final Result addIdsResult = new StreamResult(addIdsResultStream);
            final Transformer addIdsXform = addIdsTemplates.newTransformer();
            addIdsXform.transform(resourceSource, addIdsResult);
            LOGGER.debug("Added XML IDs to TEI.");
            try (final InputStream tei2RdfSourceStream =
                    addIdsResultStream.getSupplier().getInput()) {
                final Source tei2RdfSource =
                        new StreamSource(tei2RdfSourceStream);
                final StringWriter rdfXmlWriter = new StringWriter();
                final Transformer tei2RdfXform =
                        tei2RdfTemplates.newTransformer();
                tei2RdfXform.transform(tei2RdfSource,
                        new StreamResult(rdfXmlWriter));
                final String rdfXml = rdfXmlWriter.toString();
                LOGGER.debug("Created RDF/XML from TEI: \n{}", rdfXml);
                // Encode explicitly rather than with the platform default.
                // NOTE(review): assumes tei2rdf.xslt keeps the XSLT default
                // output encoding (UTF-8) so that the XML declaration matches
                // these bytes -- confirm against the stylesheet.
                return rdfXml.getBytes(UTF_8);
            }
        } finally {
            // close() alone leaves any spill file on disk; reset() closes the
            // stream and deletes the file.
            addIdsResultStream.reset();
        }
    }

    /**
     * Builds a problems-only dataset describing the given exceptions.
     *
     * @param baseUri URI used as the subject of every problem triple
     * @param es the exceptions to report; one triple is created per exception
     * @return A {@link Dataset} of RDF containing the exceptions' messages in
     *         the {@code PROBLEMS_MODEL_NAME} model
     */
    protected Dataset exceptionRdf(final String baseUri, final Exception... es) {
        final Graph problems = new GraphMem();
        final Dataset sadResults = new DatasetImpl(createDefaultModel());
        final Node subject = createURI(baseUri);
        for (final Exception e : newArrayList(es)) {
            // getMessage() may be null (e.g. a bare NullPointerException);
            // fall back to toString() so the literal always has a value.
            final String mesg =
                    e.getMessage() != null ? e.getMessage() : e.toString();
            problems.add(new Triple(subject, PROBLEM_PREDICATE,
                    createLiteral(mesg)));
        }
        sadResults.addNamedModel(PROBLEMS_MODEL_NAME,
                createModelForGraph(problems));
        return sadResults;
    }

    /**
     * @return the name of the model that holds the extracted TEI triples
     */
    @Override
    public String getPropertyModelName() {
        return MODEL_NAME;
    }

    /**
     * Delegates to {@link #getProperties(javax.jcr.Node, GraphSubjects)}.
     * {@code offset} and {@code limit} are ignored.
     *
     * @param node the node whose TEI content is transformed
     * @param subjects translates the node into its graph subject
     * @param offset ignored
     * @param limit ignored
     * @return the dataset produced by the two-argument overload
     * @throws RepositoryException declared for interface compatibility
     */
    @Override
    public Dataset getProperties(final javax.jcr.Node node,
            final GraphSubjects subjects, final long offset, final int limit)
            throws RepositoryException {
        return getProperties(node, subjects);
    }

    /**
     * Reads the binary {@code jcr:data} of the node's {@code jcr:content}
     * child as TEI XML, converts it to RDF/XML and extracts triples from it.
     *
     * @param node the node whose TEI content is transformed
     * @param subjects translates the node into its graph subject
     * @return the extracted triples, or, if any step fails, a dataset built
     *         by {@link #exceptionRdf(String, Exception...)}
     */
    @Override
    public Dataset getProperties(final javax.jcr.Node node,
            final GraphSubjects subjects) {
        // Placeholder subject in case the failure happens before the real
        // URI can be resolved.
        String baseUri = "unknown";
        try {
            baseUri = subjects.getGraphSubject(node).getURI();
            final byte[] rdfXml;
            try (final InputStream resource =
                    node.getNode(JCR_CONTENT).getProperty(JCR_DATA)
                            .getBinary().getStream()) {
                rdfXml = createRDFXML(resource);
            }
            // TODO when Any23 supports it, use a streaming transfer between
            // these two steps
            return extractTriples(rdfXml, baseUri);
        } catch (TripleHandlerException | IOException | TransformerException
                | ExtractionException | RepositoryException e) {
            // The returned RDF only carries the message; keep the stack trace
            // in the log.
            LOGGER.warn("Could not extract TEI triples for {}", baseUri, e);
            return exceptionRdf(baseUri, e);
        }
    }
}