Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.any23.extractor.html; import org.apache.any23.extractor.ExtractionContext; import org.apache.any23.extractor.ExtractionException; import org.apache.any23.extractor.ExtractionParameters; import org.apache.any23.extractor.ExtractionResult; import org.apache.any23.extractor.Extractor; import org.apache.any23.extractor.ExtractorDescription; import org.apache.any23.extractor.rdf.JSONLDExtractor; import org.apache.any23.extractor.rdf.JSONLDExtractorFactory; import org.apache.any23.rdf.RDFUtils; import org.apache.any23.vocab.SINDICE; import org.apache.commons.io.IOUtils; import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.impl.SimpleValueFactory; import org.w3c.dom.Document; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; /** * This extractor represents the HTML script tags used to embed blocks of data * in documents. This way, JSON-LD content can be easily embedded in HTML by * placing it in a script element with the type attribute set to * application/ld+json according the <a * href="http://www.w3.org/TR/json-ld/#embedding-json-ld-in-html-documents" * >JSON-LD specification</a>. * */ public class EmbeddedJSONLDExtractor implements Extractor.TagSoupDOMExtractor { private static final SINDICE vSINDICE = SINDICE.getInstance(); private IRI profile; private Map<String, IRI> prefixes = new HashMap<>(); private String documentLang; private JSONLDExtractor extractor; /** * {@inheritDoc} */ @Override public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in, ExtractionResult out) throws IOException, ExtractionException { profile = extractProfile(in); documentLang = getDocumentLanguage(in); extractLinkDefinedPrefixes(in); String baseProfile = vSINDICE.NS; if (profile != null) { baseProfile = profile.toString(); } extractionContext.getDocumentIRI(); Set<JSONLDScript> jsonldScripts = extractJSONLDScript(in, baseProfile, extractionParameters, extractionContext, out); for (JSONLDScript jsonldScript : jsonldScripts) { //String lang = documentLang; //if (jsonldScript.getLang() != null) { // lang = jsonldScript.getLang(); //} //out.writeTriple(documentIRI, jsonldScript.getName(), // SimpleValueFactory.getInstance().createLiteral(jsonldScript.getContent(), lang)); } } /** * Returns the {@link Document} language if declared, <code>null</code> * otherwise. * * @param in * a instance of {@link Document}. * @return the language declared, could be <code>null</code>. */ private String getDocumentLanguage(Document in) { String lang = DomUtils.find(in, "string(/HTML/@lang)"); if ("".equals(lang)) { return null; } return lang; } private IRI extractProfile(Document in) { String profile = DomUtils.find(in, "string(/HTML/@profile)"); if ("".equals(profile)) { return null; } return SimpleValueFactory.getInstance().createIRI(profile); } /** * It extracts prefixes defined in the <i>LINK</i> meta tags. * * @param in */ private void extractLinkDefinedPrefixes(Document in) { List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK"); for (Node linkNode : linkNodes) { NamedNodeMap attributes = linkNode.getAttributes(); Node relNode = attributes.getNamedItem("rel"); String rel = relNode == null ? null : relNode.getTextContent(); Node hrefNode = attributes.getNamedItem("href"); String href = hrefNode == null ? null : hrefNode.getTextContent(); if (rel != null && href != null && RDFUtils.isAbsoluteIRI(href)) { prefixes.put(rel, SimpleValueFactory.getInstance().createIRI(href)); } } } private Set<JSONLDScript> extractJSONLDScript(Document in, String baseProfile, ExtractionParameters extractionParameters, ExtractionContext extractionContext, ExtractionResult out) throws IOException, ExtractionException { List<Node> scriptNodes = DomUtils.findAll(in, "//SCRIPT"); Set<JSONLDScript> result = new HashSet<>(); extractor = new JSONLDExtractorFactory().createExtractor(); for (Node jsonldNode : scriptNodes) { NamedNodeMap attributes = jsonldNode.getAttributes(); for (int i = 0; i < attributes.getLength(); i++) { if ("application/ld+json".equalsIgnoreCase(attributes.item(i).getTextContent())) { extractor.run(extractionParameters, extractionContext, IOUtils.toInputStream(jsonldNode.getTextContent(), StandardCharsets.UTF_8), out); } } Node nameAttribute = attributes.getNamedItem("name"); Node contentAttribute = attributes.getNamedItem("content"); if (nameAttribute == null || contentAttribute == null) { continue; } String name = nameAttribute.getTextContent(); String content = contentAttribute.getTextContent(); String xpath = DomUtils.getXPathForNode(jsonldNode); IRI nameAsIRI = getPrefixIfExists(name); if (nameAsIRI == null) { nameAsIRI = SimpleValueFactory.getInstance().createIRI(baseProfile + name); } JSONLDScript jsonldScript = new JSONLDScript(xpath, nameAsIRI, content); result.add(jsonldScript); } return result; } private IRI getPrefixIfExists(String name) { String[] split = name.split("\\."); if (split.length == 2 && prefixes.containsKey(split[0])) { return SimpleValueFactory.getInstance().createIRI(prefixes.get(split[0]) + split[1]); } return null; } @Override public ExtractorDescription getDescription() { return EmbeddedJSONLDExtractorFactory.getDescriptionInstance(); } private class JSONLDScript { private String xpath; public JSONLDScript(String xpath, IRI name, String content) { this.xpath = xpath; } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null) { return false; } if (!(o instanceof JSONLDScript)) { return false; } JSONLDScript meta = (JSONLDScript) o; if (xpath != null ? !xpath.equals(meta.xpath) : meta.xpath != null) { return false; } return true; } @Override public int hashCode() { return xpath != null ? xpath.hashCode() : 0; } } }