Java tutorial: a Lucene-backed SKOS engine (SKOSEngineImpl)
/**
 * Copyright 2010 Bernhard Haslhofer
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.xbib.elasticsearch.index.analysis.skos.engine.jena;

import com.hp.hpl.jena.ontology.AnnotationProperty;
import com.hp.hpl.jena.ontology.ObjectProperty;
import com.hp.hpl.jena.rdf.model.Literal;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.ResIterator;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.rdf.model.StmtIterator;
import com.hp.hpl.jena.update.GraphStore;
import com.hp.hpl.jena.update.GraphStoreFactory;
import com.hp.hpl.jena.update.UpdateAction;
import com.hp.hpl.jena.update.UpdateFactory;
import com.hp.hpl.jena.update.UpdateRequest;
import com.hp.hpl.jena.util.FileManager;
import com.hp.hpl.jena.vocabulary.RDF;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;

import org.xbib.elasticsearch.index.analysis.skos.engine.SKOSEngine;
import org.xbib.elasticsearch.plugin.analysis.SKOSAnalysisPlugin;

/**
 * A Lucene-backed SKOSEngine implementation.
 *
 * Each SKOS concept is stored/indexed as a Lucene document.
 *
 * All labels are converted to lowercase.
 */
public class SKOSEngineImpl implements SKOSEngine {

    /**
     * Records the total number of matches
     */
    public static class AllDocCollector extends Collector {

        private final List<Integer> docs = new ArrayList<Integer>();

        private int base;

        @Override
        public boolean acceptsDocsOutOfOrder() {
            return true;
        }

        @Override
        public void collect(int doc) throws IOException {
            docs.add(doc + base);
        }

        public List<Integer> getDocs() {
            return docs;
        }

        @Override
        public void setNextReader(AtomicReaderContext context) throws IOException {
            base = context.docBase;
        }

        @Override
        public void setScorer(Scorer scorer) throws IOException {
            // not needed
        }
    }

    //protected final Version matchVersion;

    /*
     * Static fields used in the Lucene index
     */
    private static final String FIELD_URI = "uri";
    private static final String FIELD_PREF_LABEL = "pref";
    private static final String FIELD_ALT_LABEL = "alt";
    private static final String FIELD_HIDDEN_LABEL = "hidden";
    private static final String FIELD_BROADER = "broader";
    private static final String FIELD_NARROWER = "narrower";
    private static final String FIELD_BROADER_TRANSITIVE = "broaderTransitive";
    private static final String FIELD_NARROWER_TRANSITIVE = "narrowerTransitive";
    private static final String FIELD_RELATED = "related";

    /**
     * The input SKOS model
     */
    private Model skosModel;

    /**
     * The location of the concept index
     */
    private Directory indexDir;

    /**
     * Provides access to the index
     */
    private IndexSearcher searcher;

    /**
     * The languages to be considered when returning labels.
     *
     * If NULL, all languages are supported
     */
    private Set<String> languages;

    /**
     * The analyzer used during indexing of / querying for concepts
     *
     * SimpleAnalyzer = LetterTokenizer + LowerCaseFilter
     */
    private final Analyzer analyzer;

    /**
     * This constructor loads the SKOS model from a given InputStream using the
     * given serialization language parameter, which must be either N3, RDF/XML,
     * or TURTLE.
     *
     * @param inputStream the input stream
     * @param lang the serialization language
     * @throws IOException if the model cannot be loaded
     */
    public SKOSEngineImpl(InputStream inputStream, String lang) throws IOException {
        if (!("N3".equals(lang) || "RDF/XML".equals(lang) || "TURTLE".equals(lang))) {
            throw new IOException("Invalid RDF serialization format");
        }
        analyzer = new SimpleAnalyzer(SKOSAnalysisPlugin.getLuceneVersion());
        skosModel = ModelFactory.createDefaultModel();
        skosModel.read(inputStream, null, lang);
        indexDir = new RAMDirectory();
        entailSKOSModel();
        indexSKOSModel();
        searcher = new IndexSearcher(DirectoryReader.open(indexDir));
    }

    /**
     * This constructor loads the SKOS model from a given filename or URI,
     * starts the indexing process and sets up the index searcher.
     *
     * @param indexPath index path
     * @param filenameOrURI file name or URI
     * @param languages the languages to be considered
     * @throws IOException if the model cannot be loaded or indexed
     */
    public SKOSEngineImpl(String indexPath, String filenameOrURI, String... languages) throws IOException {
        analyzer = new SimpleAnalyzer(SKOSAnalysisPlugin.getLuceneVersion());
        String langSig = "";
        if (languages != null) {
            this.languages = new TreeSet<String>(Arrays.asList(languages));
            langSig = "-" + join(this.languages.iterator(), '.');
        }
        String name = getName(filenameOrURI);
        File dir = new File(indexPath + name + langSig);
        indexDir = FSDirectory.open(dir);
        if (!dir.isDirectory()) {
            // load the skos model from the given file
            FileManager fileManager = new FileManager();
            fileManager.addLocatorFile();
            fileManager.addLocatorURL();
            fileManager.addLocatorClassLoader(SKOSEngineImpl.class.getClassLoader());
            if (getExtension(filenameOrURI).equals("zip")) {
                fileManager.addLocatorZip(filenameOrURI);
                filenameOrURI = getBaseName(filenameOrURI);
            }
            skosModel = fileManager.loadModel(filenameOrURI);
            entailSKOSModel();
            indexSKOSModel();
        }
        searcher = new IndexSearcher(DirectoryReader.open(indexDir));
    }

    /**
     * This constructor loads the SKOS model from a given InputStream using the
     * given serialization language parameter, which must be either N3, RDF/XML,
     * or TURTLE.
     *
     * @param inputStream the input stream
     * @param format the serialization language
     * @param languages the languages to be considered
     * @throws IOException if the model cannot be loaded
     */
    public SKOSEngineImpl(InputStream inputStream, String format, String... languages) throws IOException {
        if (!("N3".equals(format) || "RDF/XML".equals(format) || "TURTLE".equals(format))) {
            throw new IOException("Invalid RDF serialization format");
        }
        if (languages != null) {
            this.languages = new TreeSet<String>(Arrays.asList(languages));
        }
        analyzer = new SimpleAnalyzer(SKOSAnalysisPlugin.getLuceneVersion());
        skosModel = ModelFactory.createDefaultModel();
        skosModel.read(inputStream, null, format);
        indexDir = new RAMDirectory();
        entailSKOSModel();
        indexSKOSModel();
        searcher = new IndexSearcher(DirectoryReader.open(indexDir));
    }

    private void entailSKOSModel() {
        GraphStore graphStore = GraphStoreFactory.create(skosModel);
        String sparqlQuery = "PREFIX skos: <http://www.w3.org/2004/02/skos/core#>\n"
                + "PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n"
                + "INSERT { ?subject rdf:type skos:Concept }\n"
                + "WHERE {\n"
                + "{ ?subject skos:prefLabel ?text } UNION\n"
                + "{ ?subject skos:altLabel ?text } UNION\n"
                + "{ ?subject skos:hiddenLabel ?text }\n"
                + "}";
        UpdateRequest request = UpdateFactory.create(sparqlQuery);
        UpdateAction.execute(request, graphStore);
    }

    /**
     * Creates a Lucene document from a SKOS concept. To allow language
     * restrictions, only labels in the configured languages are indexed.
     */
    private Document createDocumentsFromConcept(Resource skos_concept) {
        Document conceptDoc = new Document();

        String conceptURI = skos_concept.getURI();
        Field uriField = new Field(FIELD_URI, conceptURI, StringField.TYPE_STORED);
        conceptDoc.add(uriField);

        // store the preferred lexical labels
        indexAnnotation(skos_concept, conceptDoc, SKOS.prefLabel, FIELD_PREF_LABEL);

        // store the alternative lexical labels
        indexAnnotation(skos_concept, conceptDoc, SKOS.altLabel, FIELD_ALT_LABEL);

        // store the hidden lexical labels
        indexAnnotation(skos_concept, conceptDoc, SKOS.hiddenLabel, FIELD_HIDDEN_LABEL);

        // store the URIs of the broader concepts
        indexObject(skos_concept, conceptDoc, SKOS.broader, FIELD_BROADER);

        // store the URIs of the broader transitive concepts
        indexObject(skos_concept, conceptDoc, SKOS.broaderTransitive, FIELD_BROADER_TRANSITIVE);

        // store the URIs of the narrower concepts
        indexObject(skos_concept, conceptDoc, SKOS.narrower, FIELD_NARROWER);

        // store the URIs of the narrower transitive concepts
        indexObject(skos_concept, conceptDoc, SKOS.narrowerTransitive, FIELD_NARROWER_TRANSITIVE);

        // store the URIs of the related concepts
        indexObject(skos_concept, conceptDoc, SKOS.related, FIELD_RELATED);

        return conceptDoc;
    }

    @Override
    public String[] getAltLabels(String conceptURI) throws IOException {
        return readConceptFieldValues(conceptURI, FIELD_ALT_LABEL);
    }

    @Override
    public String[] getAltTerms(String label) throws IOException {
        List<String> result = new ArrayList<String>();
        // convert the query to lower-case
        String queryString = label.toLowerCase();
        try {
            String[] conceptURIs = getConcepts(queryString);
            for (String conceptURI : conceptURIs) {
                String[] altLabels = getAltLabels(conceptURI);
                if (altLabels != null) {
                    result.addAll(Arrays.asList(altLabels));
                }
            }
        } catch (Exception e) {
            System.err.println("Error when accessing SKOS Engine.\n" + e.getMessage());
        }
        return result.toArray(new String[result.size()]);
    }

    @Override
    public String[] getHiddenLabels(String conceptURI) throws IOException {
        return readConceptFieldValues(conceptURI, FIELD_HIDDEN_LABEL);
    }

    @Override
    public String[] getBroaderConcepts(String conceptURI) throws IOException {
        return readConceptFieldValues(conceptURI, FIELD_BROADER);
    }

    @Override
    public String[] getBroaderLabels(String conceptURI) throws IOException {
        return getLabels(conceptURI, FIELD_BROADER);
    }

    @Override
    public String[] getBroaderTransitiveConcepts(String conceptURI) throws IOException {
        return readConceptFieldValues(conceptURI, FIELD_BROADER_TRANSITIVE);
    }

    @Override
    public String[] getBroaderTransitiveLabels(String conceptURI) throws IOException {
        return getLabels(conceptURI, FIELD_BROADER_TRANSITIVE);
    }

    @Override
    public String[] getConcepts(String label) throws IOException {
        List<String> concepts = new ArrayList<String>();
        // convert the query to lower-case
        String queryString = label.toLowerCase();
        AllDocCollector collector = new AllDocCollector();
        DisjunctionMaxQuery query = new DisjunctionMaxQuery(0.0f);
        query.add(new TermQuery(new Term(FIELD_PREF_LABEL, queryString)));
        query.add(new TermQuery(new Term(FIELD_ALT_LABEL, queryString)));
        query.add(new TermQuery(new Term(FIELD_HIDDEN_LABEL, queryString)));
        searcher.search(query, collector);
        for (Integer hit : collector.getDocs()) {
            Document doc = searcher.doc(hit);
            String conceptURI = doc.getValues(FIELD_URI)[0];
            concepts.add(conceptURI);
        }
        return concepts.toArray(new String[concepts.size()]);
    }

    private String[] getLabels(String conceptURI, String field) throws IOException {
        List<String> labels = new ArrayList<String>();
        String[] concepts = readConceptFieldValues(conceptURI, field);
        for (String aConceptURI : concepts) {
            String[] prefLabels = getPrefLabels(aConceptURI);
            labels.addAll(Arrays.asList(prefLabels));
            String[] altLabels = getAltLabels(aConceptURI);
            labels.addAll(Arrays.asList(altLabels));
        }
        return labels.toArray(new String[labels.size()]);
    }

    @Override
    public String[] getNarrowerConcepts(String conceptURI) throws IOException {
        return readConceptFieldValues(conceptURI, FIELD_NARROWER);
    }

    @Override
    public String[] getNarrowerLabels(String conceptURI) throws IOException {
        return getLabels(conceptURI, FIELD_NARROWER);
    }

    @Override
    public String[] getNarrowerTransitiveConcepts(String conceptURI) throws IOException {
        return readConceptFieldValues(conceptURI, FIELD_NARROWER_TRANSITIVE);
    }

    @Override
    public String[] getNarrowerTransitiveLabels(String conceptURI) throws IOException {
        return getLabels(conceptURI, FIELD_NARROWER_TRANSITIVE);
    }

    @Override
    public String[] getPrefLabels(String conceptURI) throws IOException {
        return readConceptFieldValues(conceptURI, FIELD_PREF_LABEL);
    }

    @Override
    public String[] getRelatedConcepts(String conceptURI) throws IOException {
        return readConceptFieldValues(conceptURI, FIELD_RELATED);
    }

    @Override
    public String[] getRelatedLabels(String conceptURI) throws IOException {
        return getLabels(conceptURI, FIELD_RELATED);
    }

    private void indexAnnotation(Resource skos_concept, Document conceptDoc, AnnotationProperty property, String field) {
        StmtIterator stmt_iter = skos_concept.listProperties(property);
        while (stmt_iter.hasNext()) {
            Literal labelLiteral = stmt_iter.nextStatement().getObject().as(Literal.class);
            String label = labelLiteral.getLexicalForm();
            String labelLang = labelLiteral.getLanguage();
            if (this.languages != null && !this.languages.contains(labelLang)) {
                continue;
            }
            // converting label to lower-case
            label = label.toLowerCase();
            Field labelField = new Field(field, label, StringField.TYPE_STORED);
            conceptDoc.add(labelField);
        }
    }

    private void indexObject(Resource skos_concept, Document conceptDoc, ObjectProperty property, String field) {
        StmtIterator stmt_iter = skos_concept.listProperties(property);
        while (stmt_iter.hasNext()) {
            RDFNode concept = stmt_iter.nextStatement().getObject();
            if (!concept.canAs(Resource.class)) {
                System.err.println("Error when indexing relationship of concept " + skos_concept.getURI() + " .");
                continue;
            }
            Resource resource = concept.as(Resource.class);
            Field conceptField = new Field(field, resource.getURI(), TextField.TYPE_STORED);
            conceptDoc.add(conceptField);
        }
    }

    /**
     * Creates the synonym index
     *
     * @throws IOException
     */
    private void indexSKOSModel() throws IOException {
        IndexWriterConfig cfg = new IndexWriterConfig(SKOSAnalysisPlugin.getLuceneVersion(), analyzer);
        IndexWriter writer = new IndexWriter(indexDir, cfg);
        writer.getConfig().setRAMBufferSizeMB(48);
        /* iterate SKOS concepts, create Lucene docs and add them to the index */
        ResIterator concept_iter = skosModel.listResourcesWithProperty(RDF.type, SKOS.Concept);
        while (concept_iter.hasNext()) {
            Resource skos_concept = concept_iter.next();
            Document concept_doc = createDocumentsFromConcept(skos_concept);
            writer.addDocument(concept_doc);
        }
        writer.close();
    }

    /**
     * Returns the values of a given field for a given concept
     */
    private String[] readConceptFieldValues(String conceptURI, String field) throws IOException {
        Query query = new TermQuery(new Term(FIELD_URI, conceptURI));
        TopDocs docs = searcher.search(query, 1);
        ScoreDoc[] results = docs.scoreDocs;
        if (results.length != 1) {
            System.out.println("Unknown concept " + conceptURI);
            return null;
        }
        Document conceptDoc = searcher.doc(results[0].doc);
        return conceptDoc.getValues(field);
    }

    private String join(Iterator iterator, char separator) {
        // handle null, zero and one elements before building a buffer
        if (iterator == null) {
            return null;
        }
        if (!iterator.hasNext()) {
            return "";
        }
        Object first = iterator.next();
        if (!iterator.hasNext()) {
            return first == null ? "" : first.toString();
        }
        // two or more elements
        StringBuilder buf = new StringBuilder();
        if (first != null) {
            buf.append(first);
        }
        while (iterator.hasNext()) {
            buf.append(separator);
            Object obj = iterator.next();
            if (obj != null) {
                buf.append(obj);
            }
        }
        return buf.toString();
    }

    private String getName(String filename) {
        if (filename == null) {
            return null;
        }
        int index = indexOfLastSeparator(filename);
        return filename.substring(index + 1);
    }

    private static final char UNIX_SEPARATOR = '/';

    private static final char WINDOWS_SEPARATOR = '\\';

    private int indexOfLastSeparator(String filename) {
        if (filename == null) {
            return -1;
        }
        int lastUnixPos = filename.lastIndexOf(UNIX_SEPARATOR);
        int lastWindowsPos = filename.lastIndexOf(WINDOWS_SEPARATOR);
        return Math.max(lastUnixPos, lastWindowsPos);
    }

    private String getExtension(String filename) {
        if (filename == null) {
            return null;
        }
        int index = indexOfExtension(filename);
        if (index == -1) {
            return "";
        } else {
            return filename.substring(index + 1);
        }
    }

    private int indexOfExtension(String filename) {
        if (filename == null) {
            return -1;
        }
        int extensionPos = filename.lastIndexOf(EXTENSION_SEPARATOR);
        int lastSeparator = indexOfLastSeparator(filename);
        return (lastSeparator > extensionPos ? -1 : extensionPos);
    }

    public static final char EXTENSION_SEPARATOR = '.';

    private String getBaseName(String filename) {
        return removeExtension(getName(filename));
    }

    private String removeExtension(String filename) {
        if (filename == null) {
            return null;
        }
        int index = indexOfExtension(filename);
        if (index == -1) {
            return filename;
        } else {
            return filename.substring(0, index);
        }
    }
}
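As a rough usage sketch (not part of the original source), the file-based constructor above can build a thesaurus index on disk and the engine can then expand a label into concept URIs and their labels. The index path "/tmp/skos-index/", the thesaurus file "thesaurus.ttl", the language codes "en"/"de", and the query label "jazz" below are placeholder assumptions for illustration, not values taken from the plugin.

// Illustrative usage sketch only; paths, file names, languages and the query
// label are placeholder assumptions.
import java.io.IOException;
import java.util.Arrays;

import org.xbib.elasticsearch.index.analysis.skos.engine.SKOSEngine;
import org.xbib.elasticsearch.index.analysis.skos.engine.jena.SKOSEngineImpl;

public class SKOSEngineExample {

    public static void main(String[] args) throws IOException {
        // Build (or reuse) the Lucene index for the thesaurus, keeping only
        // English and German labels.
        SKOSEngine engine = new SKOSEngineImpl("/tmp/skos-index/", "thesaurus.ttl", "en", "de");

        // Resolve a (lowercased) label to concept URIs via pref/alt/hidden labels.
        for (String conceptURI : engine.getConcepts("jazz")) {
            System.out.println("concept: " + conceptURI);
            // Expand to labels of broader and related concepts, e.g. for query expansion.
            System.out.println("  broader: " + Arrays.toString(engine.getBroaderLabels(conceptURI)));
            System.out.println("  related: " + Arrays.toString(engine.getRelatedLabels(conceptURI)));
        }

        // Alternative terms (altLabels of all concepts matching the label).
        System.out.println("alt terms: " + Arrays.toString(engine.getAltTerms("jazz")));
    }
}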