Java tutorial
/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package di.uniba.it.nlpita.index; import edu.ucla.cs.scai.canali.core.index.tokens.ClassToken; import edu.ucla.cs.scai.canali.core.index.tokens.EntityToken; import edu.ucla.cs.scai.canali.core.index.tokens.IndexedToken; import static edu.ucla.cs.scai.canali.core.index.tokens.LiteralToken.BOOLEAN; import static edu.ucla.cs.scai.canali.core.index.tokens.LiteralToken.DATE; import static edu.ucla.cs.scai.canali.core.index.tokens.LiteralToken.DOUBLE; import static edu.ucla.cs.scai.canali.core.index.tokens.LiteralToken.STRING; import edu.ucla.cs.scai.canali.core.index.tokens.OntologyElementToken; import edu.ucla.cs.scai.canali.core.index.tokens.PropertyToken; import edu.ucla.cs.scai.canali.core.index.utils.Trie; import java.io.BufferedReader; import java.io.File; import java.io.FileOutputStream; import java.io.FileReader; import java.io.ObjectOutputStream; import java.io.PrintWriter; import java.nio.file.Paths; import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.StringTokenizer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.IntField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.FSDirectory; /** * * @author lucia */ public class BuildSeoDwarfIndex { /** * * * This class contains the function for creating a Lucene directory, that * can be used by the AQUA system, starting from an ontology stored in the * following text files (the format for each file is specified in the method * using it) * * triples: contain the triples subject, property, value * * property_labels: each row contains and property URI and a label - the * same property URI can appear on multiple rows (multiple labels) * * class_labels: each row contains a class URI and a label - the same class * URI can appear on multiple rows (multiple labels) * * class_parents each row contains a class URI and the URI of one of its * class parents * * entity_labels: each row contains an entity URI and a label - the same * entity URI can appear on multiple rows (multiple labels) * * entity_classes: each row contains an entity URI and the URI of one of its * classes * * basic_types_literal_types: each row contains a basic type URI and either * Double, Date, String, or Boolean * * additional_property_labels: other property labels * * additional_class_labels: other class labels * * additional_entity_labels: other class labels */ public final static String THING = "http://www.w3.org/2002/07/owl#Thing"; private int thingId; private String basePathInput; private String basePathOutput; private HashMap<String, int[]> literalTriplesSubjects = new HashMap<>(); private HashMap<String, int[]> literalTriplesProperties = new HashMap<>(); private HashMap<String, HashSet<Integer>> literalTypesInProperties = new HashMap<>(); private HashSet<String> literalTypes = new HashSet<>(); private HashMap<String, Integer> iLiteralTriples = new HashMap<>(); private HashMap<String, String> basicTypesMapping = new HashMap<>(); private HashMap<String, Integer> entityIdFromUriWithPrefix = new HashMap<>(); private HashMap<String, Integer> propertyIdFromUri = new HashMap<>(); private int[] entityTriplesSubjects; private int[] entityTriplesProperties; private int[] entityTriplesValues; private int iEntityTriples = 0; private String[] entityUriWithPrefix; private String[] propertyUri; private HashSet<String>[] propertyLabels; private HashSet<String>[] entityLabels; private HashSet<String>[] classLabels; private HashSet<Integer>[] entityClasses; private HashSet<Integer>[] entityOutProperties; private HashSet<Integer>[] entityInProperties; private HashSet<Integer>[] classOutProperties; private HashSet<Integer>[] classInProperties; private HashSet<Integer>[] propertyInProperties; private HashSet<Integer>[] propertyOutProperties; private HashMap<String, Integer> classIdFromUri = new HashMap<>(); private String[] classUri; private HashSet<Integer>[] classParents; private HashSet<Integer>[] classAncestors; private HashSet<Integer>[] classChildren; private HashSet<Integer>[] classDescendants; private int[] propertyCount; private boolean[] propertyHasLiteralRange; boolean printFiles = true; public BuildSeoDwarfIndex(String basePathInput, String basePathOutput) { if (!basePathInput.endsWith(File.separator)) { basePathInput += File.separator; } this.basePathInput = basePathInput; if (!basePathOutput.endsWith(File.separator)) { basePathOutput += File.separator; } this.basePathOutput = basePathOutput; literalTypesInProperties.put(DOUBLE, new HashSet<Integer>()); literalTypesInProperties.put(STRING, new HashSet<Integer>()); literalTypesInProperties.put(DATE, new HashSet<Integer>()); literalTypesInProperties.put(BOOLEAN, new HashSet<Integer>()); /* * Non si capisce ancora a cosa serva perch si potrebeb sostituire con * literalTypesInProperties.keysSet() Per ora lo manteniamo per coerenza * con il codice di Mazzeo */ literalTypes.add(STRING); literalTypes.add(BOOLEAN); literalTypes.add(DOUBLE); literalTypes.add(DATE); /* * Probabilimente serve per contare il numero di triple che contengono * letterale come oggetti */ for (String type : literalTypes) { iLiteralTriples.put(type, 0); } } private Integer getEntityIdFromUri(String uri) { return entityIdFromUriWithPrefix.get(uri); } private void putEntityIdFromUri(String uri, int id) { entityIdFromUriWithPrefix.put(uri, id); } private void loadBasicTypesMapping() throws Exception { System.out.println("Loading basic types mappings"); try (BufferedReader in = new BufferedReader(new FileReader(basePathInput + "basic_types_literal_types"))) { String l = in.readLine(); while (l != null) { StringTokenizer st = new StringTokenizer(l, "\t<>"); String uri = st.nextToken(); String literal = st.nextToken(); // System.out.println(uri + "\t" + literal); basicTypesMapping.put(uri, literal); l = in.readLine(); } } } private void updateTriples(String subj, String attr, String entityVal, String literalType) { Integer idSbj = getEntityIdFromUri(subj);//entityIdFromUri.get(subj); Integer idAttr = propertyIdFromUri.get(attr); if (entityVal != null) { Integer idVal = getEntityIdFromUri(entityVal);//entityIdFromUri.get(entityVal); entityTriplesSubjects[iEntityTriples] = idSbj; entityTriplesProperties[iEntityTriples] = idAttr; entityTriplesValues[iEntityTriples] = idVal; iEntityTriples++; //now, create the inverted triple Integer idInvAttr = propertyIdFromUri.get(attr + "Inv"); entityTriplesSubjects[iEntityTriples] = idVal; entityTriplesProperties[iEntityTriples] = idInvAttr; entityTriplesValues[iEntityTriples] = idSbj; iEntityTriples++; } else { int pos = iLiteralTriples.get(literalType); literalTriplesSubjects.get(literalType)[pos] = idSbj; literalTriplesProperties.get(literalType)[pos] = idAttr; iLiteralTriples.put(literalType, pos + 1); } } private void loadTriples() throws Exception { System.out.println("Mapping entities and property URIs to ids"); int nEntityTriples = 0; HashMap<String, Integer> nLiteralTriples = new HashMap<>(); for (String type : literalTypes) { nLiteralTriples.put(type, 0); } HashSet<String> unrecognizedBasicTypes = new HashSet<>(); //count entity-valued and literal-valued triples //and //create the association between uris and ids for entities try (BufferedReader in = new BufferedReader(new FileReader(basePathInput + "triples"))) { String l = in.readLine(); int n = 0; while (l != null && l.length() > 0) { //StringTokenizer st = new StringTokenizer(l, "<> \t"); StringTokenizer st = new StringTokenizer(l, "\t"); String subject = st.nextToken(); System.out.println("SUBJECT: " + subject); String property = st.nextToken(); System.out.println("PROPERTY: " + property); String value = st.nextToken(); System.out.println("VALUE: " + value); // if (subject.startsWith("http") && property.startsWith("http")) { Integer idSbj = getEntityIdFromUri(subject); //entityIdFromUri.get(subject); if (idSbj == null) { idSbj = entityIdFromUriWithPrefix.size() + 1;//entityIdFromUri.size() + 1; putEntityIdFromUri(subject, idSbj); //entityIdFromUri.put(subject, idSbj); } Integer idAttr = propertyIdFromUri.get(property); if (idAttr == null) { idAttr = propertyIdFromUri.size() + 1; propertyIdFromUri.put(property, idAttr); } if (value.startsWith("http") || value.startsWith("ftp:")) { //it is an entity Integer idVal = getEntityIdFromUri(value); //entityIdFromUri.get(value); if (idVal == null) { idVal = entityIdFromUriWithPrefix.size() + 1;//entityIdFromUri.size() + 1; putEntityIdFromUri(value, idVal);//entityIdFromUri.put(value, idVal); } Integer idInvAttr = propertyIdFromUri.get(property + "Inv"); if (idInvAttr == null) { idInvAttr = propertyIdFromUri.size() + 1; propertyIdFromUri.put(property + "Inv", idInvAttr); } nEntityTriples += 2; } else //it is a literal if (value.endsWith("^^")) { //it is a basic type String type = org.apache.commons.lang3.StringEscapeUtils.unescapeJava(st.nextToken()); String literalType = basicTypesMapping.get(type); if (literalType != null) { nLiteralTriples.put(literalType, nLiteralTriples.get(literalType) + 1); } else if (!unrecognizedBasicTypes.contains(type)) { System.out.println("Unrecognized type: " + type); System.out.println("in line: " + l); unrecognizedBasicTypes.add(type); } } else if (value.startsWith("\"")) { //it is a String nLiteralTriples.put(STRING, nLiteralTriples.get(STRING) + 1); } else { //it is a blanknode } n++; if (n % 1000000 == 0) { System.out.println("Loaded " + (n / 1000000) + "M triples"); } // } else { // System.out.println("Invalid triple: " + l); // } l = in.readLine(); } } System.out.println("Number of triples with entity value: " + nEntityTriples); for (String type : literalTypes) { System.out.println("Number of triples with " + type + " value: " + nLiteralTriples.get(type)); } entityTriplesSubjects = new int[nEntityTriples]; entityTriplesProperties = new int[nEntityTriples]; entityTriplesValues = new int[nEntityTriples]; for (String type : literalTypes) { literalTriplesSubjects.put(type, new int[nLiteralTriples.get(type)]); literalTriplesProperties.put(type, new int[nLiteralTriples.get(type)]); } //load the triples into the arrays creaded above System.out.println("Loading triples"); try (BufferedReader in = new BufferedReader(new FileReader(basePathInput + "triples"))) { String l = in.readLine(); int n = 0; while (l != null && l.length() > 0) { StringTokenizer st = new StringTokenizer(l, "<> \t"); String sbj = st.nextToken(); String attr = st.nextToken(); String val = st.nextToken(); if (sbj.startsWith("http") && attr.startsWith("http")) { if (val.startsWith("http") || val.startsWith("ftp:")) { //it is an entity updateTriples(sbj, attr, val, null); } else //it is a literal if (val.endsWith("^^")) { //it is a basic type String type = org.apache.commons.lang3.StringEscapeUtils.unescapeJava(st.nextToken()); String literalType = basicTypesMapping.get(type); if (literalType != null) { updateTriples(sbj, attr, null, literalType); } else if (!unrecognizedBasicTypes.contains(type)) { System.out.println("Unrecognized type: " + type); System.out.println("in line: " + l); unrecognizedBasicTypes.add(type); } } else if (val.startsWith("\"")) { //it is a String updateTriples(sbj, attr, null, STRING); } else { System.out.println("Unexpected line: " + l); } n++; if (n % 1000000 == 0) { System.out.println("Loaded " + (n / 1000000) + "M triples"); } } else { System.out.println("Invalid triple: " + l); } l = in.readLine(); } } System.out.println("Entity value triples: " + entityTriplesSubjects.length); for (String type : literalTriplesSubjects.keySet()) { System.out.println(type + " value triples: " + literalTriplesSubjects.get(type).length); } propertyUri = new String[propertyIdFromUri.size() + 1]; for (Map.Entry<String, Integer> e : propertyIdFromUri.entrySet()) { propertyUri[e.getValue()] = e.getKey(); } entityUriWithPrefix = new String[entityIdFromUriWithPrefix.size() + 1]; for (Map.Entry<String, Integer> e : entityIdFromUriWithPrefix.entrySet()) { entityUriWithPrefix[e.getValue()] = e.getKey(); } entityLabels = new HashSet[entityIdFromUriWithPrefix.size() + 1]; //entityLabels = new HashSet[entityIdFromUri.size() + 1]; entityClasses = new HashSet[entityIdFromUriWithPrefix.size() + 1]; //entityClasses = new HashSet[entityIdFromUri.size() + 1]; propertyLabels = new HashSet[propertyIdFromUri.size() + 1]; entityOutProperties = new HashSet[entityIdFromUriWithPrefix.size() + 1]; //entityOutProperties = new HashSet[entityIdFromUri.size() + 1]; entityInProperties = new HashSet[entityIdFromUriWithPrefix.size() + 1]; //entityInProperties = new HashSet[entityIdFromUri.size() + 1]; propertyOutProperties = new HashSet[propertyIdFromUri.size() + 1]; propertyInProperties = new HashSet[propertyIdFromUri.size() + 1]; propertyHasLiteralRange = new boolean[propertyIdFromUri.size() + 1]; propertyCount = new int[propertyIdFromUri.size() + 1]; } private void processePropertyLabelsFile(String fileName) throws Exception { try (BufferedReader in = new BufferedReader(new FileReader(basePathInput + fileName))) { String l = in.readLine(); while (l != null) { if (l.length() > 0) { StringTokenizer st = new StringTokenizer(l, "\t<>"); String uri = st.nextToken().trim(); Integer id = propertyIdFromUri.get(uri); if (id != null) { //we ignore the labels of properties not used in triples try { String label = st.nextToken().trim(); System.out.println("label " + label); //!!! if (propertyLabels[id] == null) { propertyLabels[id] = new HashSet<>(); } propertyLabels[id].add(label); if (uri.endsWith("Inv")) { System.out.println("Label \"" + label + "\" for inverted property " + uri); } Integer idInv = propertyIdFromUri.get(uri + "Inv"); if (idInv != null) { if (propertyLabels[idInv] == null) { propertyLabels[idInv] = new HashSet<>(); } propertyLabels[idInv].add(label + " [inverted]"); } } catch (Exception e) { System.out.println("Line: " + l); } } else { System.out.println("prop without id " + uri); } } l = in.readLine(); } } } private void loadPropertyLabels() throws Exception { System.out.println("Loading property labels"); processePropertyLabelsFile("property_labels"); //qui // // for (int i = 1; i < propertyLabels.length; i++) { // if (propertyLabels[i] != null) { // System.out.println("propertyLabels[" + i + "]" + propertyLabels[i]); // } else { // System.out.println("propertyLabels[" + i + "]" + " null"); // } // } // // for (int i = 1; i < propertyUri.length; i++) { // if (propertyUri[i] != null) { // System.out.println("propertyUri[" + i + "]" + propertyUri[i]); // } else { // System.out.println("propertyUri[" + i + "]" + " null"); // } // } // //now, we drop the propertys without a label from the map of uri -> id for (int i = 1; i < propertyLabels.length; i++) { if (propertyLabels[i] == null) { // System.out.println("removed uri: " + propertyUri[i] + " " + i); propertyIdFromUri.remove(propertyUri[i]); propertyUri[i] = null; } } } private void processClassLabelsFile(String fileName, ArrayList<HashSet<String>> labels) throws Exception { try (BufferedReader in = new BufferedReader(new FileReader(basePathInput + fileName))) { String l = in.readLine(); while (l != null) { if (l.length() > 0) { StringTokenizer st = new StringTokenizer(l, "\t<>"); try { String uri = st.nextToken().trim(); String label = st.nextToken().trim(); //System.out.println(uri + "\t" + label); if (!classIdFromUri.containsKey(uri)) { classIdFromUri.put(uri, labels.size() + 1); labels.add(new HashSet<String>()); labels.get(labels.size() - 1).add(label); } else { labels.get(classIdFromUri.get(uri) - 1).add(label); } } catch (Exception e) { System.out.println("Error with line " + l); e.printStackTrace(); } } l = in.readLine(); } } } private void loadClassLabels() throws Exception { ArrayList<HashSet<String>> labels = new ArrayList<>(); classIdFromUri.put(THING, 1); thingId = 1; labels.add(new HashSet<String>()); labels.get(0).add("thing"); System.out.println("Loading class labels"); processClassLabelsFile("class_labels", labels); classLabels = new HashSet[labels.size() + 1]; int i = 1; for (HashSet<String> l : labels) { classLabels[i] = l; i++; } classUri = new String[classIdFromUri.size() + 1]; for (Map.Entry<String, Integer> e : classIdFromUri.entrySet()) { classUri[e.getValue()] = e.getKey(); } } /* * Compute the set of ancestors of a class */ private void computeClassAncestors(int cId) { if (classAncestors[cId] != null) { return; //it was already computed } classAncestors[cId] = new HashSet<>(); for (Integer pId : classParents[cId]) { //the parent is an ancestor classAncestors[cId].add(pId); computeClassAncestors(pId); //and the ancestors of the parent are ancestors as well classAncestors[cId].addAll(classAncestors[pId]); } } /* * Compute the set of descendants of a class */ private void computeClassDescendants(int cId) { if (classDescendants[cId] != null) { return; //it was already computed } classDescendants[cId] = new HashSet<>(); for (Integer pId : classChildren[cId]) { //the parent is an ancestor classDescendants[cId].add(pId); computeClassDescendants(pId); //and the ancestors of the parent are ancestors as well classDescendants[cId].addAll(classDescendants[pId]); } } private void loadClassHierarchy() throws Exception { System.out.println("Loading class parents and building the hierarchy"); //firs, we initialize class parents classParents = new HashSet[classIdFromUri.size() + 1]; for (int i = 1; i < classParents.length; i++) { classParents[i] = new HashSet<>(); //we don't initialize class ancestors because the null value is used to check if the class has not been processed yet } try (BufferedReader in = new BufferedReader(new FileReader(basePathInput + "class_parents"))) { String l = in.readLine(); while (l != null) { if (l.length() > 0) { StringTokenizer st = new StringTokenizer(l, "\t<>"); String claz = st.nextToken().trim(); String parent = st.nextToken().trim(); //we are interested only in the hierarchical relationships between //classes defined inside our ontology Integer cId = classIdFromUri.get(claz); Integer pId = classIdFromUri.get(parent); if (cId != null && pId != null && !pId.equals(cId)) { classParents[cId].add(pId); } } l = in.readLine(); } //now add Thing to empty sets of parents for (int cId = 1; cId < classParents.length; cId++) { if (classParents[cId].isEmpty()) { classParents[cId].add(thingId); } } classParents[thingId].clear(); //now, for each class compute the set of its ancestors classAncestors = new HashSet[classIdFromUri.size() + 1]; for (int cId = 1; cId < classAncestors.length; cId++) { computeClassAncestors(cId); } //now, reduce the set of class parents, by keeping only the most specific classes for (int cId = 1; cId < classParents.length; cId++) { HashSet<Integer> currentParents = classParents[cId]; HashSet<Integer> reducedParents = new HashSet<>(); for (Integer pId : currentParents) { //check if reducedParents contains an ancestor of parent, //or if parent is an ancestor of any class in reducedParents boolean add = true; for (Iterator<Integer> it = reducedParents.iterator(); it.hasNext();) { Integer c = it.next(); if (classAncestors[c].contains(pId)) { add = false; //we don't add parent, beacause c is a descendant of parent break; } else if (classAncestors[pId].contains(c)) { it.remove(); //we remove c beacause parent is a descendant of c } } if (add) { reducedParents.add(pId); } } classParents[cId] = reducedParents; } //now, compute the class children for each class classChildren = new HashSet[classIdFromUri.size() + 1]; for (int cId = 1; cId < classChildren.length; cId++) { classChildren[cId] = new HashSet<>(); } for (int cId = 1; cId < classParents.length; cId++) { for (Integer pId : classParents[cId]) { classChildren[pId].add(cId); } } //now compute the class descendants for each class classDescendants = new HashSet[classIdFromUri.size() + 1]; for (int cId = 1; cId < classDescendants.length; cId++) { computeClassDescendants(cId); } } } private void processEntityLabelsFile(String fileName) throws Exception { try (BufferedReader in = new BufferedReader(new FileReader(basePathInput + fileName))) { String l = in.readLine(); while (l != null) { if (l.length() > 0) { StringTokenizer st = new StringTokenizer(l, "\t<>"); String uri = st.nextToken(); Integer id = getEntityIdFromUri(uri); //entityIdFromUri.get(uri); if (id != null) { //we ignore the labels of entities not used in triples try { String label = st.nextToken(); //System.out.println(uri + "\t" + label); if (entityLabels[id] == null) { entityLabels[id] = new HashSet<>(); } entityLabels[id].add(label); } catch (Exception e) { System.out.println("Failed to add label: " + l); } } else { //System.out.println("Ignored label of "+uri); } } l = in.readLine(); } } } private void loadEntityLabels() throws Exception { System.out.println("Loading entity labels"); processEntityLabelsFile("entity_labels"); //now, we drop the entities without a label from the map of uri -> id for (int i = 1; i < entityLabels.length; i++) { if (entityLabels[i] == null) { entityLabels[i] = new HashSet<>(); entityLabels[i].add(entityUriWithPrefix[i]); } } } private void loadEntityClasses() throws Exception { System.out.println("Loading entity classes, and keeping only the most specific"); int count = 0; HashSet<Integer> notEmptyClasses = new HashSet<>(); try (BufferedReader in = new BufferedReader(new FileReader(basePathInput + "entity_classes"))) { String l = in.readLine(); while (l != null) { try { StringTokenizer st = new StringTokenizer(l, "\t<>"); String uriE = st.nextToken(); String uriC = st.nextToken(); Integer idE = getEntityIdFromUri(uriE);//entityIdFromUri.get(uriE); Integer idC = classIdFromUri.get(uriC); if (!uriC.equals(THING) && idE != null && idC != null && entityLabels[idE] != null && classLabels[idC] != null) { //we ignore the classes without label and the classes of entities not used in triples //we also ignore thing as class, since every entity is implicitly a thing HashSet<Integer> classes = entityClasses[idE]; if (classes == null) { classes = new HashSet<>(); entityClasses[idE] = classes; count++; } //check if classes contains an ancestor of uriC, //or if uriC is an ancestor of any class in classes boolean add = true; for (Iterator<Integer> it = classes.iterator(); it.hasNext();) { Integer c = it.next(); if (classAncestors[c].contains(idC)) { add = false; //we don't add class, beacause c is a descendant of class break; } else if (classAncestors[idC].contains(c)) { it.remove(); //we remove c beacause uriC is a descendant of c } } if (add) { classes.add(idC); notEmptyClasses.add(idC); } } } catch (Exception e) { System.out.println("Failed to load class: " + l); } l = in.readLine(); } } System.out.println(count + " entities have been assigned a non-thing class"); count = 0; //now, set Thing as class of entities without a class for (int i = 1; i < entityClasses.length; i++) { if (entityClasses[i] == null && entityLabels[i] != null) { entityClasses[i] = new HashSet<>(); entityClasses[i].add(thingId); count++; } } System.out.println(count + " entities have been assigned thing class"); //now drop the classes without entities and without descendant classes - asking for those classes would produce empty results could confuse the user for (int i = 1; i < classLabels.length; i++) { if (i != thingId && !notEmptyClasses.contains(i) && classDescendants[i].isEmpty()) { classLabels[i] = null; classUri[i] = null; } } } private void updateOutAndInEntityAndLiteralTypeProperties(Integer property, Integer subj, Integer entityVal, String literalType) { if (entityOutProperties[subj] == null) { entityOutProperties[subj] = new HashSet<>(); } entityOutProperties[subj].add(property); if (entityVal != null) { if (entityInProperties[entityVal] == null) { entityInProperties[entityVal] = new HashSet<>(); } entityInProperties[entityVal].add(property); } if (literalType != null) { literalTypesInProperties.get(literalType).add(property); } } private void propagatePropertiesToAncestorClasses(HashSet<Integer>[] classProperties, int claz) { for (int childClass : classChildren[claz]) { propagatePropertiesToAncestorClasses(classProperties, childClass); } HashSet<Integer> properties = classProperties[claz]; if (properties == null) { properties = new HashSet<>(); classProperties[claz] = properties; } for (Integer child : classChildren[claz]) { properties.addAll(classProperties[child]); } } private void processTriples() throws Exception { int droppedEntityTriples = 0; int droppedLiteralTriples = 0; try (PrintWriter out = new PrintWriter(new FileOutputStream(basePathInput + "dropped_triples", false), true)) { System.out.println("Dropping triples with undefined elements"); for (int i = 0; i < entityTriplesSubjects.length; i++) { int sbj = entityTriplesSubjects[i]; int attr = entityTriplesProperties[i]; int val = entityTriplesValues[i]; if (entityUriWithPrefix[sbj] == null || entityUriWithPrefix[val] == null || propertyUri[attr] == null) { out.println( entityUriWithPrefix[sbj] + "\t" + propertyUri[attr] + "\t" + entityUriWithPrefix[val]); entityTriplesSubjects[i] = 0; droppedEntityTriples++; } else { propertyCount[attr]++; } } for (String type : literalTypes) { for (int i = 0; i < literalTriplesSubjects.get(type).length; i++) { int sbj = literalTriplesSubjects.get(type)[i]; int attr = literalTriplesProperties.get(type)[i]; if (entityUriWithPrefix[sbj] == null || propertyUri[attr] == null) { out.println(entityUriWithPrefix[sbj] + "\t" + propertyUri[attr]); literalTriplesSubjects.get(type)[i] = 0; droppedLiteralTriples++; } else { propertyCount[attr]++; } } } } catch (Exception e) { e.printStackTrace(); } System.out.println("Dropped " + droppedEntityTriples + " triples with entity value and " + droppedLiteralTriples + " with literal value"); System.out.println( "Scanning the triples to compute out-propertys of entities and in-propertys of entities and literal basic types"); //first compute out-propertys and in-propertys of entities //and in-propertys of basic types //<sbj, attr, val>, where val is an entity -> add attr to out-propertys of sbj and in-propertys of val int c = 0; for (int i = 0; i < entityTriplesSubjects.length; i++) { if (entityTriplesSubjects[i] == 0) { //it was previously dropped continue; } int sbj = entityTriplesSubjects[i]; int attr = entityTriplesProperties[i]; int val = entityTriplesValues[i]; if (entityUriWithPrefix[sbj] != null && propertyUri[attr] != null && entityUriWithPrefix[val] != null) { updateOutAndInEntityAndLiteralTypeProperties(attr, sbj, val, null); } c++; if (c % 1000000 == 0) { System.out.println("Processed " + (c / 1000000) + "M triples"); } } entityTriplesSubjects = null; entityTriplesProperties = null; entityTriplesValues = null; System.gc(); //<sbj, attr, type>, where type is a basic type -> add attr to out-propertys of sbj and in-propertys of type for (String literalType : literalTypes) { for (int i = 0; i < literalTriplesSubjects.get(literalType).length; i++) { if (literalTriplesSubjects.get(literalType)[i] == 0) { //it was previously dropped continue; } int sbj = literalTriplesSubjects.get(literalType)[i]; int attr = literalTriplesProperties.get(literalType)[i]; if (entityUriWithPrefix[sbj] != null && propertyUri[attr] != null) {//if (entityUri[sbj] != null && propertyUri[attr] != null) { updateOutAndInEntityAndLiteralTypeProperties(attr, sbj, null, literalType); propertyHasLiteralRange[attr] = true; } c++; if (c % 1000000 == 0) { System.out.println("Processed " + (c / 1000000) + "M triples"); } } } literalTriplesSubjects = null; literalTriplesProperties = null; System.gc(); System.out.println("Scanning the entity out-propertys to compute out-propertys of classes"); //now it is possible to compute the out-propertys and in-propertys of classes //entityOutProperties of e contains a -> add a to classOutProperties of all the classes of e classOutProperties = new HashSet[classUri.length]; for (int i = 1; i < entityOutProperties.length; i++) { if (entityOutProperties[i] != null && entityClasses[i] != null) { for (int property : entityOutProperties[i]) { for (int claz : entityClasses[i]) { if (classOutProperties[claz] == null) { classOutProperties[claz] = new HashSet<>(); } classOutProperties[claz].add(property); } } } } System.out.println("Propagating the out-propertys to ancestor classes"); propagatePropertiesToAncestorClasses(classOutProperties, thingId); if (printFiles) { System.out.println("Writing the classOutProperties"); try (PrintWriter out = new PrintWriter( new FileOutputStream(basePathOutput + "class_out_propertys", false), true)) { for (int i = 1; i < classOutProperties.length; i++) { if (i % 10 == 0) { out.flush(); } if (classOutProperties[i] != null) { out.print(classUri[i]); for (Integer a : classOutProperties[i]) { out.print("\t" + propertyUri[a]); } out.println(); } } } } System.out.println("Scanning the entity in-propertys to compute in-propertys of classes"); //entityInProperties of e contains a -> add a to classInProperties of all the classes of e classInProperties = new HashSet[classUri.length]; for (int i = 1; i < entityInProperties.length; i++) { if (entityInProperties[i] != null && entityClasses[i] != null) { for (int property : entityInProperties[i]) { for (int claz : entityClasses[i]) { if (classInProperties[claz] == null) { classInProperties[claz] = new HashSet<>(); } classInProperties[claz].add(property); } } } } System.out.println("Propagating the in-propertys to ancestor classes"); propagatePropertiesToAncestorClasses(classOutProperties, thingId); if (printFiles) { System.out.println("Writing the classInProperties"); try (PrintWriter out = new PrintWriter( new FileOutputStream(basePathOutput + "class_in_propertys", false), true)) { for (int i = 1; i < classInProperties.length; i++) { if (i % 10 == 0) { out.flush(); } if (classInProperties[i] != null) { out.print(classUri[i]); for (Integer a : classInProperties[i]) { out.print("\t" + propertyUri[a]); } out.println(); } } } } System.out.println("Scanning the triples to compute out- and in-propertys of propertys"); //now it is possible to compute the out-propertys and in-propertys of propertys //<t[0], t[1], t[2]> -> add t[1] to outProperties[property] for each property in entityInProperties[t[0]] for (int entity = 1; entity < entityInProperties.length; entity++) { if (entityInProperties[entity] != null) { for (int property : entityInProperties[entity]) { if (entityOutProperties[entity] != null && !entityOutProperties[entity].isEmpty()) { if (propertyOutProperties[property] == null) { propertyOutProperties[property] = new HashSet<>(); } propertyOutProperties[property].addAll(entityOutProperties[entity]); } if (propertyInProperties[property] == null) { propertyInProperties[property] = new HashSet<>(); } propertyInProperties[property].addAll(entityInProperties[entity]); } } } //I will use the literalTypesInProperties when I index the property with rangeOf //write the in/Out-Entity/Class-Properties if (printFiles) { System.out.println("Writing the entityInProperties"); try (PrintWriter out = new PrintWriter( new FileOutputStream(basePathOutput + "entity_in_propertys", false), true)) { for (int i = 1; i < entityInProperties.length; i++) { if (i % 100 == 0) { out.flush(); } if (entityInProperties[i] != null) { out.print(entityUriWithPrefix[i]);//out.print(entityUri[i]); for (Integer a : entityInProperties[i]) { out.print("\t" + propertyUri[a]); } out.println(); if (i % 100000 == 0) { System.out.print(entityLabels[i] + " - inProperties: "); for (Integer a : entityInProperties[i]) { System.out.print("\t" + propertyLabels[a]); } System.out.println(); } } } } } if (printFiles) { System.out.println("Writing the entityOutProperties"); try (PrintWriter out = new PrintWriter( new FileOutputStream(basePathOutput + "entity_out_propertys", false), true)) { for (int i = 1; i < entityOutProperties.length; i++) { if (i % 100 == 0) { out.flush(); } if (entityOutProperties[i] != null) { out.print(entityUriWithPrefix[i]);//out.print(entityUri[i]); for (Integer a : entityOutProperties[i]) { out.print("\t" + propertyUri[a]); } out.println(); if (i % 100000 == 0) { System.out.print(entityLabels[i] + " - outProperties: "); for (Integer a : entityOutProperties[i]) { System.out.print("\t" + propertyLabels[a]); } System.out.println(); } } } } } //write the literalInProperties if (printFiles) { try (PrintWriter out = new PrintWriter( new FileOutputStream(basePathOutput + "literal_types_in_propertys", false), true)) { for (Map.Entry<String, HashSet<Integer>> e : literalTypesInProperties.entrySet()) { out.print(e.getKey()); for (Integer a : e.getValue()) { out.print("\t" + propertyUri[a]); } out.println(); } } } if (printFiles) { try (PrintWriter out = new PrintWriter( new FileOutputStream(basePathOutput + "property_in_propertys", false), true)) { for (int property = 1; property < propertyUri.length; property++) { if (property % 10 == 0) { out.flush(); } if (propertyInProperties[property] != null && !propertyInProperties[property].isEmpty()) { out.print(propertyUri[property]); for (Integer a : propertyInProperties[property]) { out.print("\t" + propertyUri[a]); } out.println(); } } } } if (printFiles) { try (PrintWriter out = new PrintWriter( new FileOutputStream(basePathOutput + "property_out_propertys", false), true)) { for (int property = 1; property < propertyUri.length; property++) { if (property % 10 == 0) { out.flush(); } if (propertyOutProperties[property] != null && !propertyOutProperties[property].isEmpty()) { out.print(propertyUri[property]); for (Integer a : propertyOutProperties[property]) { out.print("\t" + propertyUri[a]); } out.println(); } } } } } private static void indexOntologyElement(IndexWriter writer, OntologyElementToken e, Collection<String> domainOf, Collection<String> rangeOf, Collection<String> extendedDomain) throws Exception { Document doc = new Document(); doc.add(new Field("label", e.getLabel(), TextField.TYPE_NOT_STORED)); doc.add(new IntField("id", e.getId(), IntField.TYPE_STORED)); doc.add(new Field("type", e.getType(), StringField.TYPE_NOT_STORED)); if (domainOf != null) { for (String d : domainOf) { //the first element is the URI doc.add(new Field("domainOfProperty", d, StringField.TYPE_NOT_STORED)); } } if (rangeOf != null) { for (String r : rangeOf) { //the first element is the URI doc.add(new Field("rangeOfProperty", r, StringField.TYPE_NOT_STORED)); } } if (extendedDomain != null) { for (String d : extendedDomain) { //the first element is the URI doc.add(new Field("propertyDomain", d, StringField.TYPE_NOT_STORED)); } } writer.addDocument(doc); } //2 // private static void indexOntologyElement(IndexWriter writer, OntologyElementToken e, Collection<String> domainOf, Collection<String> rangeOf, Collection<String> extendedDomain) throws Exception { // Document doc = new Document(); // doc.add(new Field("label", e.getLabel(), TextField.TYPE_STORED)); // doc.add(new LegacyIntField("id", e.getId(), LegacyIntField.TYPE_STORED)); // doc.add(new Field("type", e.getType(), StringField.TYPE_STORED)); // if (domainOf != null) { // for (String d : domainOf) { //the first element is the URI // doc.add(new Field("domainOfProperty", d, StringField.TYPE_NOT_STORED)); // } // } // if (rangeOf != null) { // for (String r : rangeOf) { //the first element is the URI // doc.add(new Field("rangeOfProperty", r, StringField.TYPE_NOT_STORED)); // } // } // if (extendedDomain != null) { // for (String d : extendedDomain) { //the first element is the URI // doc.add(new Field("propertyDomain", d, StringField.TYPE_STORED)); // } // } // writer.addDocument(doc); // } private void indexEntities(IndexWriter writer, HashMap<Integer, IndexedToken> elements) throws Exception { for (int i = 1; i < entityUriWithPrefix.length; i++) {//for (int i = 1; i < entityUri.length; i++) { if (entityUriWithPrefix[i] != null) {//if (entityUri[i] != null) { HashSet<String> domainOf = new HashSet<>(); HashSet<String> rangeOf = new HashSet<>(); if (entityOutProperties[i] != null) { for (int a : entityOutProperties[i]) { domainOf.add(propertyUri[a]); } } if (entityInProperties[i] != null) { for (int a : entityInProperties[i]) { rangeOf.add(propertyUri[a]); } } for (String label : entityLabels[i]) { EntityToken element = new EntityToken(entityUriWithPrefix[i], label, false);//EntityToken element = new EntityToken(entityUri[i], label); indexOntologyElement(writer, element, domainOf, rangeOf, null); elements.put(element.getId(), element); } } } entityOutProperties = null; entityInProperties = null; System.gc(); } private void indexClasses(IndexWriter writer, HashMap<Integer, IndexedToken> elements) throws Exception { HashSet<Character> vowels = new HashSet<>(); vowels.add('a'); vowels.add('e'); vowels.add('i'); vowels.add('o'); vowels.add('u'); for (int i = 1; i < classUri.length; i++) { if (classUri[i] != null) { HashSet<String> domainOf = new HashSet<>(); HashSet<String> rangeOf = new HashSet<>(); if (classOutProperties[i] == null) { classOutProperties[i] = new HashSet<>(); } for (int a : classOutProperties[i]) { domainOf.add(propertyUri[a]); } if (classInProperties[i] == null) { classInProperties[i] = new HashSet<>(); } for (int a : classInProperties[i]) { rangeOf.add(propertyUri[a]); } for (String label : classLabels[i]) { label = label.toLowerCase(); ClassToken elementSingular = new ClassToken(classUri[i], label, IndexedToken.SINGULAR, false); indexOntologyElement(writer, elementSingular, domainOf, rangeOf, null); elements.put(elementSingular.getId(), elementSingular); //now create the plural form String pLabel; if (label.endsWith("y") && !vowels.contains(label.charAt(label.length() - 2))) { pLabel = label.substring(0, label.length() - 1) + "ies"; } else if (label.endsWith("s") || label.endsWith("sh") || label.endsWith("ch") || label.endsWith("x") || label.endsWith("z")) { pLabel = label + "es"; } else if (label.equals("person")) { pLabel = "people"; } else { pLabel = label + "s"; } ClassToken elementPlural = new ClassToken(classUri[i], pLabel, IndexedToken.PLURAL, false); indexOntologyElement(writer, elementPlural, domainOf, rangeOf, null); elements.put(elementPlural.getId(), elementPlural); } } } } private void indexProperties(IndexWriter writer, HashMap<Integer, IndexedToken> elements) throws Exception { //precompute the domains of properties HashSet<String>[] propertyDomains = new HashSet[propertyUri.length]; //the domain of a property a is the set of classes and propertys having a in their outProperty for (int claz = 1; claz < classOutProperties.length; claz++) { if (classOutProperties[claz] != null && classUri[claz] != null) { for (int a : classOutProperties[claz]) { if (propertyDomains[a] == null) { propertyDomains[a] = new HashSet<>(); } propertyDomains[a].add(classUri[claz]); } } } for (int property = 1; property < propertyOutProperties.length; property++) { if (propertyOutProperties[property] != null) { for (int a : propertyOutProperties[property]) { if (propertyDomains[a] == null) { propertyDomains[a] = new HashSet<>(); } propertyDomains[a].add(propertyUri[property]); } } } //precompute the literal ranges of every property HashSet<String>[] propertyLiteralRanges = new HashSet[propertyUri.length]; for (int i = 1; i < propertyLiteralRanges.length; i++) { propertyLiteralRanges[i] = new HashSet<>(); } for (String literalType : literalTypesInProperties.keySet()) { for (int property : literalTypesInProperties.get(literalType)) { propertyLiteralRanges[property].add(literalType); } } for (int property = 1; property < propertyUri.length; property++) { if (propertyUri[property] != null) { HashSet<String> domainOf = new HashSet<>(); if (propertyOutProperties[property] != null) { for (int a : propertyOutProperties[property]) { domainOf.add(propertyUri[a]); } } HashSet<String> rangeOf = new HashSet<>(); if (propertyInProperties[property] != null) { for (int a : propertyInProperties[property]) { rangeOf.add(propertyUri[a]); } } for (String type : literalTypes) { if (literalTypesInProperties.get(type).contains(property)) { for (int a : literalTypesInProperties.get(type)) { rangeOf.add(propertyUri[a]); } } } for (String label : propertyLabels[property]) { HashSet<String> aDomains = propertyDomains[property]; PropertyToken element = new PropertyToken(propertyUri[property], label, IndexedToken.UNDEFINED, IndexedToken.UNDEFINED, propertyOutProperties[property] != null && !propertyOutProperties[property].isEmpty(), propertyHasLiteralRange[property], false); indexOntologyElement(writer, element, domainOf, rangeOf, aDomains); element.setPropertyAndClassDomain(aDomains); element.addBasicTypeRanges(propertyLiteralRanges[property]); elements.put(element.getId(), element); } } } classOutProperties = null; classInProperties = null; propertyOutProperties = null; propertyInProperties = null; System.gc(); } public void start() throws Exception { long t = System.currentTimeMillis(); //loadBasicTypesMapping(); System.out.println(System.currentTimeMillis() - t + " ms."); t = System.currentTimeMillis(); loadTriples(); System.out.println(System.currentTimeMillis() - t + " ms."); t = System.currentTimeMillis(); loadPropertyLabels(); System.out.println(System.currentTimeMillis() - t + " ms."); t = System.currentTimeMillis(); loadClassLabels(); System.out.println(System.currentTimeMillis() - t + " ms."); t = System.currentTimeMillis(); loadClassHierarchy(); System.out.println(System.currentTimeMillis() - t + " ms."); t = System.currentTimeMillis(); loadEntityLabels(); System.out.println(System.currentTimeMillis() - t + " ms."); t = System.currentTimeMillis(); loadEntityClasses(); System.out.println(System.currentTimeMillis() - t + " ms."); t = System.currentTimeMillis(); entityIdFromUriWithPrefix = null; classIdFromUri = null; propertyIdFromUri = null; System.gc(); processTriples(); System.out.println(System.currentTimeMillis() - t + " ms."); t = System.currentTimeMillis(); HashMap<String, Analyzer> analyzerMap = new HashMap<>(); analyzerMap.put("label", new EnglishAnalyzer(CharArraySet.EMPTY_SET)); analyzerMap.put("id", new WhitespaceAnalyzer()); analyzerMap.put("type", new WhitespaceAnalyzer()); analyzerMap.put("domainOfProperty", new WhitespaceAnalyzer()); analyzerMap.put("rangeOfProperty", new WhitespaceAnalyzer()); analyzerMap.put("propertyDomain", new WhitespaceAnalyzer()); Analyzer analyzer = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), analyzerMap); HashMap<Integer, IndexedToken> elements = new HashMap<>(); try (FSDirectory directory = FSDirectory.open(Paths.get(basePathOutput + "lucene"))) { IndexWriterConfig iwc = new IndexWriterConfig(analyzer); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); try (IndexWriter writer = new IndexWriter(directory, iwc)) { System.out.println("Indexing entities"); indexEntities(writer, elements); System.out.println(System.currentTimeMillis() - t); t = System.currentTimeMillis(); System.out.println("Indexing classes"); indexClasses(writer, elements); System.out.println(System.currentTimeMillis() - t); t = System.currentTimeMillis(); System.out.println("Indexing propertys"); indexProperties(writer, elements); System.out.println(System.currentTimeMillis() - t); t = System.currentTimeMillis(); } catch (Exception e) { e.printStackTrace(); } } //save elements to file System.out.println("Creating the trie"); Trie trie = new Trie(); int c = 0; for (IndexedToken it : elements.values()) { trie.add(it.getText()); c++; if (c % 100000 == 0) { System.out.println(c + " elements added to the trie"); } } System.out.println(c + " elements added to the trie"); c = 0; for (IndexedToken it : elements.values()) { String suffix = trie.getOneSuffix(it.getText()); if (suffix != null) { it.setPrefix(true); c++; } } System.out.println(c + " are prefix of another element"); System.out.println("Serializing the tokens"); try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(basePathOutput + "elements"))) { oos.writeObject(elements); oos.writeInt(IndexedToken.counter); } } public static void main(String[] args) throws Exception { String fn1 = null, fn2 = null; if (args != null && args.length == 2) { fn1 = args[0]; fn2 = args[1]; } else { //fn1 = "/home/gaetangate/Dev/nlp2sparql-data/dbpedia-processed/2015-10/supportFiles/"; //fn2 = "/home/gaetangate/Dev/nlp2sparql-data/dbpedia-processed/2015-10/index/"; fn1 = "/home/lucia/data/seodwarf/index/supportFiles/"; fn2 = "/home/lucia/data/seodwarf/index/processed/"; } long start = System.currentTimeMillis(); System.out.println("BUILD MINI INDEX"); System.out.println("Started at " + new Date()); new BuildSeoDwarfIndex(fn1, fn2).start(); System.out.println("Ended at " + new Date()); long time = System.currentTimeMillis() - start; System.out.println("Indexing process finished in " + time / (double) 1000 + " sec."); } }