List of usage examples for org.apache.lucene.index IndexWriter addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
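Before the collected examples, a minimal self-contained sketch of calling addDocument against a recent Lucene version (roughly 6.x or newer, where the method returns a sequence number). The class name, index path, and field name are illustrative only and do not come from the examples below:

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

public class AddDocumentSketch {
    public static void main(String[] args) throws IOException {
        // open (or create) an index in a local directory; path is hypothetical
        try (FSDirectory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
             IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            // a Document is simply an Iterable of IndexableField instances
            doc.add(new TextField("title", "Hello Lucene", Field.Store.YES));
            long seqNo = writer.addDocument(doc); // sequence number of the operation
            writer.commit();
            System.out.println("added document, sequence number " + seqNo);
        }
    }
}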
From source file:at.ac.univie.mminf.luceneSKOS.analysis.engine.jena.SKOSEngineImpl.java
License:Apache License
/**
 * Creates the synonym index.
 *
 * @throws IOException
 */
private void indexSKOSModel() throws IOException {
    IndexWriterConfig cfg = new IndexWriterConfig(analyzer);
    IndexWriter writer = new IndexWriter(indexDir, cfg);
    writer.getConfig().setRAMBufferSizeMB(48);
    /* iterate SKOS concepts, create Lucene docs and add them to the index */
    ResIterator concept_iter = skosModel.listResourcesWithProperty(RDF.type, SKOS.Concept);
    while (concept_iter.hasNext()) {
        Resource skos_concept = concept_iter.next();
        Document concept_doc = createDocumentsFromConcept(skos_concept);
        writer.addDocument(concept_doc);
    }
    writer.close();
}
From source file:at.ac.univie.mminf.luceneSKOS.skos.impl.SKOSEngineImpl.java
License:Apache License
/**
 * Creates the synonym index.
 *
 * @throws IOException
 */
private void indexSKOSModel() throws IOException {
    IndexWriterConfig cfg = new IndexWriterConfig(matchVersion, analyzer);
    IndexWriter writer = new IndexWriter(indexDir, cfg);
    writer.getConfig().setRAMBufferSizeMB(48);
    /* iterate SKOS concepts, create Lucene docs and add them to the index */
    ResIterator concept_iter = skosModel.listResourcesWithProperty(RDF.type, SKOS.Concept);
    while (concept_iter.hasNext()) {
        Resource skos_concept = concept_iter.next();
        Document concept_doc = createDocumentsFromConcept(skos_concept);
        // System.out.println("Adding document to index " + concept_doc);
        writer.addDocument(concept_doc);
    }
    writer.close();
}
From source file:at.lux.fotoretrieval.retrievalengines.LucenePathIndexRetrievalEngine.java
License:Open Source License
public void indexFilesSemantically(String pathToIndex, StatusBar statusBar) {
    if (statusBar != null)
        statusBar.setStatus("Creating index from semantic annotations");
    SAXBuilder builder = new SAXBuilder();
    XMLOutputter outputter = new XMLOutputter(
            Format.getRawFormat().setIndent("").setLineSeparator("").setExpandEmptyElements(false));
    try {
        String[] descriptions = FileOperations.getAllDescriptions(new File(pathToIndex), true);
        if (descriptions == null)
            return;
        float numAllDocsPercent = (float) descriptions.length / 100f;
        DecimalFormat df = (DecimalFormat) NumberFormat.getInstance();
        df.setMaximumFractionDigits(1);
        // Preparing objects for the index:
        HashMap<String, ElementEntry> elementMap = new HashMap<String, ElementEntry>(descriptions.length);
        HashMap<Element, LinkedList<String>> element2document = new HashMap<Element, LinkedList<String>>(
                descriptions.length);
        // in the first run we identify the semantic objects that we want to index and build
        // a table where we can relate them to the documents (identified by their path)
        for (int i = 0; i < descriptions.length; i++) {
            try {
                Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                for (Object aL : l) {
                    Element semanticElement = (Element) aL;
                    String xmlString = outputter.outputString(semanticElement).trim()
                            .replaceAll("id=\"id_[0-9]*\"", "");
                    // check if element is already there, indicator is its string representation.
                    if (!elementMap.keySet().contains(xmlString)) {
                        // it's not here, put it in.
                        elementMap.put(xmlString, new ElementEntry(semanticElement, elementMap.size()));
                        // System.out.println(xmlString);
                    }
                    // now get the unified element
                    semanticElement = elementMap.get(xmlString).semanticElement;
                    // and check if there is an entry in the table for where to find the element
                    if (!element2document.keySet().contains(semanticElement)) {
                        element2document.put(semanticElement, new LinkedList<String>());
                    }
                    // and add found document if not already there:
                    List documentList = element2document.get(semanticElement);
                    if (!documentList.contains(descriptions[i]))
                        documentList.add(descriptions[i]);
                }
                if (statusBar != null)
                    statusBar.setStatus(
                            "Parsing documents for nodes: " + df.format((float) i / numAllDocsPercent));
            } catch (JDOMException e1) {
                System.err.println("Exception in document #" + i + ": " + e1.getMessage());
            } catch (IOException e1) {
                e1.printStackTrace();
            }
        }
        // read stats:
        // System.out.println("Got " + countOverallElements + " Elements in " + descriptions.length
        //         + " descriptions, " + elementMap.size() + " elements are pairwise different.");
        // Now we can add the nodes to a lucene index:
        // fields: label, id, type, files (separated by '|'), xml, all
        // -------------------------------------------
        // opening the index for writing:
        boolean createFlag = true;
        String indexDir = parseSemanticIndexDirectory(pathToIndex);
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriter writer = new IndexWriter(indexDir, analyzer, createFlag);
        if (statusBar != null)
            statusBar.setStatus("Creating index for " + element2document.size() + " different available nodes");
        // iterating through nodes and storing them:
        for (Element semElement : element2document.keySet()) {
            // needed for later XPath :( otherwise everything in the whole document is retrieved.
            String fileList = getFileListFromNode(element2document.get(semElement));
            Document idxDocument = new Document();
            // adding the file itself ...
            idxDocument.add(new Field("files", fileList, Field.Store.YES, Field.Index.NO));
            // System.out.println(((Element) o).getTextTrim());
            // StringBuilder all = new StringBuilder(255);
            // adding the label
            // addToDocument(idxDocument, semElement, "//Label/Name", "label", all);
            String elementLabel = semElement.getChild("Label", semElement.getNamespace())
                    .getChildTextTrim("Name", semElement.getNamespace());
            Field labelField = new Field("label", elementLabel, Field.Store.YES, Field.Index.TOKENIZED);
            labelField.setBoost(1.2f);
            idxDocument.add(labelField);
            // adding the type:
            String elementType = semElement.getAttribute("type", xsi).getValue().trim();
            idxDocument.add(new Field("type", elementType, Field.Store.YES, Field.Index.NO));
            // adding the XML contents:
            String xmlString = outputter.outputString(semElement);
            idxDocument.add(new Field("xml", xmlString, Field.Store.YES, Field.Index.NO));
            // adding the id:
            idxDocument.add(
                    new Field("id", elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id + "",
                            Field.Store.YES, Field.Index.UN_TOKENIZED));
            // TODO: split the indexing for objects based on type:
            // adding all, unstored for retrieval only
            if (elementType.equals("AgentObjectType")) {
                createIndexDocumentFromSemanticAgent(semElement, idxDocument);
            } else if (elementType.equals("EventType")) {
                createIndexDocumentFromSemanticElement(semElement, idxDocument);
            } else if (elementType.equals("SemanticPlaceType")) {
                createIndexDocumentFromSemanticElement(semElement, idxDocument);
            } else if (elementType.equals("SemanticTimeType")) {
                createIndexDocumentFromSemanticElement(semElement, idxDocument);
            } else {
                createIndexDocumentFromSemanticElement(semElement, idxDocument);
            }
            writer.addDocument(idxDocument);
        }
        // now optimize and close the index:
        // todo: open index for appending and/or updating
        writer.optimize();
        writer.close();
        // Now we can create the powerset for each existing graph
        // (based on sorted node ids) and store
        // all resulting graphs within an index.
        // ----------------------------------------------------------
        if (statusBar != null)
            statusBar.setStatus("Creating and merging of available graphs");
        HashMap<Graph, HashSet<String>> graph2document = new HashMap<Graph, HashSet<String>>(
                descriptions.length);
        for (int i = 0; i < descriptions.length; i++)
            try {
                Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                HashMap<String, Integer> docID2overallID = new HashMap<String, Integer>(l.size());
                LinkedList<Relation> relations = new LinkedList<Relation>();
                LinkedList<Integer> nodes = new LinkedList<Integer>();
                for (Object aL : l) {
                    Element semanticElement = (Element) aL;
                    String xmlString = outputter.outputString(semanticElement);
                    int id = elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id;
                    String docID = semanticElement.getAttribute("id").getValue();
                    docID2overallID.put(docID, id);
                    nodes.add(id);
                }
                // get all relations with global ids and eliminate inverse relations
                l = RetrievalToolkit.xpathQuery(e, "//Graph/Relation", null);
                for (Object aL1 : l) {
                    Element relation = (Element) aL1;
                    int source = docID2overallID.get(relation.getAttribute("source").getValue().substring(1));
                    int target = docID2overallID.get(relation.getAttribute("target").getValue().substring(1));
                    String type = relation.getAttribute("type").getValue();
                    type = type.substring(type.lastIndexOf(':') + 1);
                    Relation r = eliminateInverse(new Relation(source, target, type));
                    relations.add(r);
                }
                // now create a graph object
                Collections.sort(nodes);
                Collections.sort(relations);
                LinkedList<Node> nodeList = new LinkedList<Node>();
                for (Integer node : nodes) {
                    nodeList.add(new Node(node));
                }
                Graph g = new Graph(nodeList, relations);
                HashSet<String> docs = new HashSet<String>(1);
                docs.add(descriptions[i]);
                graph2document.put(g, docs);
            } catch (JDOMException e1) {
                System.err.println(new StringBuilder().append("Exception in document #").append(i).append(": ")
                        .append(e1.getMessage()).toString());
            }
        HashMap<String, Graph> str2graph = new HashMap<String, Graph>(graph2document.size() / 2);
        HashMap<Graph, HashSet<String>> g2d = new HashMap<Graph, HashSet<String>>(descriptions.length);
        /* For now we reduce the number of graphs by identifying and merging duplicates
           and removing redundant entries: */
        for (Graph g : graph2document.keySet()) {
            if (str2graph.containsKey(g.toString())) {
                g2d.get(str2graph.get(g.toString())).addAll(graph2document.get(g));
            } else {
                str2graph.put(g.toString(), g);
                g2d.put(g, graph2document.get(g));
            }
        }
        graph2document = g2d;
        System.out.println(graph2document.size() + " non trivial different graphs were found");
        // now put all the available graphs into an index:
        // -----------------------------------------------
        // for now we will store a simple text file:
        if (statusBar != null)
            statusBar.setStatus("Saving index of paths");
        boolean createPathIndexFlag = true;
        String pathIndexDir = parsePathIndexDirectory(pathToIndex);
        IndexWriter pathIndexWriter = new IndexWriter(pathIndexDir, new GraphAnalyzer(), createPathIndexFlag);
        for (Graph graph : graph2document.keySet()) {
            HashSet<String> files = graph2document.get(graph);
            Document idxDocument = new Document();
            // adding the file itself ...
            for (String s : files) {
                idxDocument.add(new Field("file", s, Field.Store.YES, Field.Index.NO));
            }
            // adding the graph ...
            idxDocument.add(new Field("graph", graph.toString(), Field.Store.YES, Field.Index.TOKENIZED));
            // idxDocument.add(Field.UnIndexed("graph", graph.toString()));
            // adding the paths
            StringBuilder sb = new StringBuilder(256);
            sb.append(graph.toString());
            List<Path> pathList = (new LabeledGraph(graph)).get2Paths();
            if (!pathList.isEmpty())
                sb.append(' ');
            for (Iterator<Path> iterator1 = pathList.iterator(); iterator1.hasNext();) {
                Path path = iterator1.next();
                sb.append(path.toString());
                if (iterator1.hasNext())
                    sb.append(' ');
            }
            idxDocument.add(new Field("paths", sb.toString(), Field.Store.YES, Field.Index.TOKENIZED));
            pathIndexWriter.addDocument(idxDocument);
        }
        // now optimize and close the index:
        pathIndexWriter.optimize();
        pathIndexWriter.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:at.lux.fotoretrieval.retrievalengines.LuceneRetrievalEngine.java
License:Open Source License
/**
 * In general we take the base path for our search from the pathToIndex parameter.
 * We then add the directory "index" and create it there.
 *
 * @param pathToIndex
 * @param statusBar
 */
public void indexFiles(String pathToIndex, StatusBar statusBar) {
    // parsing and eventually creating the directory for the index ...
    String indexDir = parseFulltextIndexDirectory(pathToIndex);
    Analyzer analyzer = new StandardAnalyzer();
    boolean createFlag = true;
    SAXBuilder builder = new SAXBuilder();
    String prefix = "Creating fulltext index: ";
    try {
        IndexWriter writer = new IndexWriter(indexDir, analyzer, createFlag);
        String[] descriptions = FileOperations.getAllDescriptions(new File(pathToIndex), true);
        if (descriptions == null)
            return;
        float numAllDocsPercent = (float) descriptions.length / 100f;
        DecimalFormat df = (DecimalFormat) NumberFormat.getInstance();
        df.setMaximumFractionDigits(1);
        for (int i = 0; i < descriptions.length; i++) {
            try {
                Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                Document idxDocument = new Document();
                // adding the file itself ...
                idxDocument.add(new Field("file", descriptions[i], Field.Store.YES, Field.Index.NO));
                // adding all given names
                StringBuilder all = new StringBuilder(255);
                List l = RetrievalToolkit.xpathQuery(e, "//Graph/Relation", null);
                // System.out.println("NumberOfRelations: " + l.size());
                addToDocument(idxDocument, e, "//Agent/Name/GivenName", "GivenName", all);
                addToDocument(idxDocument, e, "//Agent/Name/FamilyName", "FamilyName", all);
                addToDocument(idxDocument, e, "//Label/Name", "Label", all);
                addToDocument(idxDocument, e, "//FreeTextAnnotation", "FreeTextAnnotation", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/Who/Name", "Who", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/Where/Name", "Where", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/How/Name", "How", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/Why/Name", "Why", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/When/Name", "When", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/WhatObject/Name", "WhatObjects", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/WhatAction/Name", "WhatAction", all);
                idxDocument.add(new Field("all", all.toString(), Field.Store.NO, Field.Index.TOKENIZED));
                writer.addDocument(idxDocument);
                if (statusBar != null) {
                    StringBuilder status = new StringBuilder(13).append(prefix);
                    status.append(df.format(((float) i) / numAllDocsPercent));
                    status.append('%');
                    statusBar.setStatus(status.toString());
                }
            } catch (Exception e1) {
                System.err.println("Error with file " + descriptions[i] + " (" + e1.getMessage() + ")");
            }
        }
        writer.optimize();
        writer.close();
        if (statusBar != null) {
            statusBar.setStatus("Indexing finished");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:at.lux.fotoretrieval.retrievalengines.LuceneRetrievalEngine.java
License:Open Source License
public void indexFilesSemantically(String pathToIndex, StatusBar statusBar) {
    if (statusBar != null)
        statusBar.setStatus("Creating index from semantic annotations");
    SAXBuilder builder = new SAXBuilder();
    XMLOutputter outputter = new XMLOutputter(
            Format.getRawFormat().setIndent("").setLineSeparator("").setExpandEmptyElements(false));
    try {
        String[] descriptions = FileOperations.getAllDescriptions(new File(pathToIndex), true);
        if (descriptions == null)
            return;
        float numAllDocsPercent = (float) descriptions.length / 100f;
        DecimalFormat df = (DecimalFormat) NumberFormat.getInstance();
        df.setMaximumFractionDigits(1);
        // Preparing objects for the index:
        HashMap<String, ElementEntry> elementMap = new HashMap<String, ElementEntry>(descriptions.length);
        HashMap<Element, LinkedList<String>> element2document = new HashMap<Element, LinkedList<String>>(
                descriptions.length);
        // in the first run we identify the semantic objects that we want to index and build
        // a table where we can relate them to the documents (identified by their path)
        for (int i = 0; i < descriptions.length; i++) {
            try {
                Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                    Element semanticElement = (Element) iterator.next();
                    String xmlString = outputter.outputString(semanticElement).trim()
                            .replaceAll("id=\"id_[0-9]*\"", "");
                    // check if element is already there, indicator is its string representation.
                    if (!elementMap.keySet().contains(xmlString)) {
                        // it's not here, put it in.
                        elementMap.put(xmlString, new ElementEntry(semanticElement, elementMap.size()));
                        // System.out.println(xmlString);
                    }
                    // now get the unified element
                    semanticElement = elementMap.get(xmlString).semanticElement;
                    // and check if there is an entry in the table for where to find the element
                    if (!element2document.keySet().contains(semanticElement)) {
                        element2document.put(semanticElement, new LinkedList<String>());
                    }
                    // and add found document if not already there:
                    List documentList = element2document.get(semanticElement);
                    if (!documentList.contains(descriptions[i]))
                        documentList.add(descriptions[i]);
                }
                if (statusBar != null)
                    statusBar.setStatus(
                            "Parsing documents for nodes: " + df.format((float) i / numAllDocsPercent));
            } catch (JDOMException e1) {
                System.err.println("Exception in document #" + i + ": " + e1.getMessage());
            } catch (IOException e1) {
                e1.printStackTrace();
            }
        }
        // read stats:
        // System.out.println("Got " + countOverallElements + " Elements in " + descriptions.length
        //         + " descriptions, " + elementMap.size() + " elements are pairwise different.");
        // Now we can add the nodes to a lucene index:
        // fields: label, id, type, files (separated by '|'), xml, all
        // -------------------------------------------
        // opening the index for writing:
        boolean createFlag = true;
        String indexDir = parseSemanticIndexDirectory(pathToIndex);
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriter writer = new IndexWriter(indexDir, analyzer, createFlag);
        if (statusBar != null)
            statusBar.setStatus("Creating index for " + element2document.size() + " different available nodes");
        // iterating through nodes and storing them:
        for (Iterator<Element> iterator = element2document.keySet().iterator(); iterator.hasNext();) {
            Element semElement = iterator.next();
            // needed for later XPath :( otherwise everything in the whole document is retrieved.
            String fileList = getFileListFromNode(element2document.get(semElement));
            Document idxDocument = new Document();
            // adding the file itself ...
            idxDocument.add(new Field("files", fileList, Field.Store.YES, Field.Index.NO));
            // System.out.println(((Element) o).getTextTrim());
            StringBuilder all = new StringBuilder(255);
            // adding the label
            // addToDocument(idxDocument, semElement, "//Label/Name", "label", all);
            String elementLabel = semElement.getChild("Label", semElement.getNamespace())
                    .getChildTextTrim("Name", semElement.getNamespace());
            idxDocument.add(new Field("label", elementLabel, Field.Store.YES, Field.Index.TOKENIZED));
            // adding the type:
            String elementType = semElement.getAttribute("type", xsi).getValue().trim();
            idxDocument.add(new Field("type", elementType, Field.Store.YES, Field.Index.NO));
            // adding the XML contents:
            String xmlString = outputter.outputString(semElement);
            idxDocument.add(new Field("xml", xmlString, Field.Store.YES, Field.Index.NO));
            // adding the id:
            idxDocument.add(
                    new Field("id", elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id + "",
                            Field.Store.YES, Field.Index.NO));
            // adding all, unstored for retrieval only
            List l = RetrievalToolkit.xpathQuery(semElement, "*//*", null);
            for (Iterator it3 = l.iterator(); it3.hasNext();) {
                Element e = (Element) it3.next();
                all.append(e.getTextTrim());
                all.append(" ");
            }
            idxDocument.add(new Field("all", all.toString(), Field.Store.NO, Field.Index.TOKENIZED));
            writer.addDocument(idxDocument);
        }
        // now optimize and close the index:
        // todo: open index for appending and/or updating
        writer.optimize();
        writer.close();
        // Now we can create the powerset for each existing graph
        // (based on sorted node ids) and store
        // all resulting graphs within an index.
        // ----------------------------------------------------------
        if (statusBar != null)
            statusBar.setStatus("Creating and merging powersets of available graphs");
        HashMap<Graph, HashSet<String>> graph2document = new HashMap<Graph, HashSet<String>>(
                descriptions.length);
        for (int i = 0; i < descriptions.length; i++) {
            try {
                Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                HashMap<String, Integer> docID2overallID = new HashMap<String, Integer>(l.size());
                LinkedList<Relation> relations = new LinkedList<Relation>();
                LinkedList<Integer> nodes = new LinkedList<Integer>();
                for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                    Element semanticElement = (Element) iterator.next();
                    String xmlString = outputter.outputString(semanticElement);
                    int id = elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id;
                    String docID = semanticElement.getAttribute("id").getValue();
                    docID2overallID.put(docID, id);
                    nodes.add(id);
                }
                // get all relations with global ids and eliminate inverse relations
                l = RetrievalToolkit.xpathQuery(e, "//Graph/Relation", null);
                for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                    Element relation = (Element) iterator.next();
                    int source = docID2overallID.get(relation.getAttribute("source").getValue().substring(1));
                    int target = docID2overallID.get(relation.getAttribute("target").getValue().substring(1));
                    String type = relation.getAttribute("type").getValue();
                    type = type.substring(type.lastIndexOf(':') + 1);
                    Relation r = eliminateInverse(new Relation(source, target, type));
                    relations.add(r);
                }
                // now create a graph object
                Collections.sort(nodes);
                Collections.sort(relations);
                LinkedList<Node> nodeList = new LinkedList<Node>();
                for (Iterator<Integer> iterator = nodes.iterator(); iterator.hasNext();) {
                    nodeList.add(new Node(iterator.next()));
                }
                Graph g = new Graph(nodeList, relations);
                // List<Graph> powerSet = new LinkedList<Graph>();
                // powerSet.add(g);
                HashSet<String> docs = new HashSet<String>(1);
                docs.add(descriptions[i]);
                graph2document.put(g, docs);
                /*
                // add all these subgraphs and the reference to the document to
                // a data structure:
                for (Iterator<Graph> iterator = powerSet.iterator(); iterator.hasNext();) {
                    Graph graph = iterator.next();
                    // List<Graph> relationsPowerSet = graph.getPowerSetOfRelations();
                    // for (Iterator<Graph> iterator1 = relationsPowerSet.iterator(); iterator1.hasNext();) {
                    //     Graph graph1 = iterator1.next();
                    // }
                    // add graph if not trivial:
                    if (graph.getNodes().size() > 1) {
                        // containsKey for Graph does not match my needs -
                        // different graph objects reference the same graph!
                        if (string2graph.containsKey(graph.toString())) {
                            graph = string2graph.get(graph.toString());
                            graph2document.get(graph).add(descriptions[i]);
                        } else {
                            HashSet<String> docs = new HashSet<String>(1);
                            docs.add(descriptions[i]);
                            graph2document.put(graph, docs);
                        }
                    }
                }
                */
            } catch (JDOMException e1) {
                System.err.println("Exception in document #" + i + ": " + e1.getMessage());
            }
        }
        HashMap<String, Graph> str2graph = new HashMap<String, Graph>(graph2document.size() / 2);
        HashMap<Graph, HashSet<String>> g2d = new HashMap<Graph, HashSet<String>>(descriptions.length);
        /* For now we reduce the number of graphs by identifying and merging duplicates
           and removing redundant entries: */
        for (Iterator<Graph> iterator = graph2document.keySet().iterator(); iterator.hasNext();) {
            Graph g = iterator.next();
            if (str2graph.containsKey(g.toString())) {
                g2d.get(str2graph.get(g.toString())).addAll(graph2document.get(g));
            } else {
                str2graph.put(g.toString(), g);
                g2d.put(g, graph2document.get(g));
            }
        }
        graph2document = g2d;
        System.out.println(graph2document.size() + " non trivial different graphs were found");
        // now put all the available graphs into an index:
        // -----------------------------------------------
        // todo: create real fast storable index of subgraphs instead of file :-) possible candidate a trie
        // for now we will store a simple text file:
        if (statusBar != null)
            statusBar.setStatus("Storing powersets of available graphs as file");
        String indexFile;
        if (!pathToIndex.endsWith(File.separator)) {
            indexFile = pathToIndex + File.separator + "idx_graphs.list";
        } else {
            indexFile = pathToIndex + "idx_graphs.list";
        }
        File f = new File(indexFile);
        BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(f, false))));
        for (Iterator<Graph> iterator = graph2document.keySet().iterator(); iterator.hasNext();) {
            Graph g = iterator.next();
            bw.write(g.toString());
            for (Iterator<String> iterator1 = graph2document.get(g).iterator(); iterator1.hasNext();) {
                String s = iterator1.next();
                bw.write("|" + s);
            }
            bw.write("\n");
        }
        bw.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:au.edu.unimelb.csse.analyser.NodeCacheTest.java
License:Apache License
public void testReusesNodesWhileIndexing() throws Exception {
    String[] sents = new String[] { "(A(B C)(D(E F)))", "(A(B(C D)))", "(A(B C)(D(E(F(G H)))))", "(A(B C))" };
    String[] jsonSents = new String[sents.length];
    String2NodesParser parser = new String2NodesParser();
    assertEquals(0, NodeCache.cacheSize());
    int[] expectedCounts = new int[] { 0, 2, 0, 5 };
    // First sent: 6 nodes are used but they are not returned until the next sentence is read.
    // Hence the cache still returns a size of 0
    // Second sent: 6 nodes are returned back but the new sentence contains 4 nodes
    // 6 - 4 = 2
    // Third sent: 4 nodes are returned back but the new sentence contains 8 nodes
    // size shows 0 again
    // Fourth sent: 8 nodes are returned back but the new sentence contains 3 nodes
    // 8 - 3 = 5
    for (int i = 0; i < sents.length; i++) {
        jsonSents[i] = parser.parse(sents[i]).asJSONString();
        assertEquals(expectedCounts[i], NodeCache.cacheSize());
    }
    Analyzer analyser = new NodeTreebankAnalyser(false);
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, analyser, true, IndexWriter.MaxFieldLength.UNLIMITED);
    Document d = new Document();
    d.add(new Field("sent", jsonSents[0], Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(d);
    // No change to Node cache
    assertEquals(5, NodeCache.cacheSize());
    d = new Document();
    d.add(new Field("sent", jsonSents[1], Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(d);
    // No change to Node cache
    assertEquals(5, NodeCache.cacheSize());
    d = new Document();
    d.add(new Field("sent", jsonSents[2], Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(d);
    // No change to Node cache
    assertEquals(5, NodeCache.cacheSize());
    d = new Document();
    d.add(new Field("sent", jsonSents[3], Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(d);
    // No change to Node cache
    assertEquals(5, NodeCache.cacheSize());
}
From source file:au.edu.unimelb.csse.join.JoinFunctionalTest.java
License:Apache License
/**
 * This test is actually commented out. To run the test, match counting has to be enabled in JoinLogic.
 *
 * @throws Exception
 */
public void testNumberOfCallsToMatch() throws Exception {
    String sent = "(NP" + "(NP" + "(DT The)" + "(NN year))" + "(NP" + "(NP(CD 1956))" + "(PP" + "(IN in)"
            + "(NP(JJ rugby)(NN union))" + ")" + ")" + "(. .)" + ")";
    Analyzer analyser = new FastStringAnalyser();
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, analyser, true, IndexWriter.MaxFieldLength.UNLIMITED);
    Document d = new Document();
    d.add(new Field("sent", sent, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(d);
    writer.close();
    IndexSearcher searcher = new IndexSearcher(dir);
    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.SIMPLE, false, 6);
    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.SIMPLE_WITH_FC, false, 1);
    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.EARLY_STOP, false, 2);
    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.EARLY_STOP_WITH_FC, false, 1);
    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.SIMPLE, true, 6);
    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.SIMPLE_WITH_FC, true, 5);
    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.EARLY_STOP, true, 6);
    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.EARLY_STOP_WITH_FC, true, 5);
    assertNumberOfComparisons(searcher, "//NP//NP//NP", TermJoinType.SIMPLE, false, 23);
    assertNumberOfComparisons(searcher, "//NP//NP//NP", TermJoinType.SIMPLE_WITH_FC, false, 10);
    assertNumberOfComparisons(searcher, "//NP//NP//NP", TermJoinType.EARLY_STOP, false, 10);
    assertNumberOfComparisons(searcher, "//NP//NP//NP", TermJoinType.EARLY_STOP_WITH_FC, false, 8);
}
From source file:au.edu.unimelb.csse.join.JoinFunctionalTest.java
License:Apache License
public void testFilterjoin() throws Exception {
    String sent = "(NP" + "(NP" + "(DT The)" + "(NN year))" + "(NP" + "(NP(CD 1956))" + "(PP" + "(IN in)"
            + "(NP(JJ rugby)(NN union))" + ")" + ")" + "(. .)" + ")";
    Analyzer analyser = new FastStringAnalyser();
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, analyser, true, IndexWriter.MaxFieldLength.UNLIMITED);
    Document d = new Document();
    d.add(new Field("sent", sent, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(d);
    writer.close();
    IndexSearcher searcher = new IndexSearcher(dir);
    boolean[] lookaheadOptions = new boolean[] { false, true };
    for (TermJoinType type : TermJoinType.values()) {
        for (boolean lookahead : lookaheadOptions) {
            QueryBuilder builder = new QueryBuilder("//PP[/IN AND /NP]");
            TreebankQuery query = builder.parse(type, lookahead);
            SimpleHitCollector hitCollector = new SimpleHitCollector(10);
            searcher.search(query, hitCollector);
            assertEquals(1, hitCollector.totalHits);
        }
    }
    QueryBuilder builder = new QueryBuilder("//PP[/IN AND /NP/JJ/rugby]");
    TreebankQuery query = builder.parse(TermJoinType.SIMPLE, true);
    SimpleHitCollector hitCollector = new SimpleHitCollector(10);
    searcher.search(query, hitCollector);
    assertEquals(1, hitCollector.totalHits);
}
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/**
 * Creates the temporary index that provides a lookup of checklist bank id to GUID.
 */
private IndexSearcher createTmpGuidIndex(String cbExportFile) throws Exception {
    System.out.println("Starting to create the tmp guid index...");
    IndexWriter iw = createIndexWriter(new File("/data/tmp/guid"), new KeywordAnalyzer(), true);
    au.com.bytecode.opencsv.CSVReader cbreader = new au.com.bytecode.opencsv.CSVReader(
            new FileReader(cbExportFile), '\t', '"', '/', 1);
    for (String[] values = cbreader.readNext(); values != null; values = cbreader.readNext()) {
        Document doc = new Document();
        String id = values[POS_ID];
        String guid = values[POS_LSID];
        doc.add(new StringField("id", id, Store.YES));
        if (StringUtils.isEmpty(id))
            guid = id;
        doc.add(new StoredField("guid", guid));
        iw.addDocument(doc);
    }
    System.out.println("Finished writing the tmp guid index...");
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    // As of lucene 4.0 all IndexReaders are read only
    return new IndexSearcher(DirectoryReader.open(FSDirectory.open(new File("/data/tmp/guid"))));
}
From source file:au.org.ala.names.search.ALANameIndexer.java
License:Open Source License
/**
 * Adds the extra ALA concepts from the legislated lists that are missing from the NSL.
 *
 * @param iw
 * @param file
 * @throws Exception
 */
private void addExtraALAConcept(IndexWriter iw, String file) throws Exception {
    if (new File(file).exists()) {
        au.com.bytecode.opencsv.CSVReader reader = new au.com.bytecode.opencsv.CSVReader(new FileReader(file),
                ',', '"', '\\', 1);
        for (String[] values = reader.readNext(); values != null; values = reader.readNext()) {
            String lsid = values[0];
            String scientificName = values[1];
            String authority = values[2];
            Document doc = createALAIndexDocument(scientificName, "-1", lsid, authority, null);
            iw.addDocument(doc);
        }
    }
}