Example usage for org.apache.lucene.index IndexWriter addDocument

Introduction

On this page you can find example usages of org.apache.lucene.index.IndexWriter.addDocument.

Prototype

public long addDocument(Iterable<? extends IndexableField> doc) throws IOException 

Document

Adds a document to this index.
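
A minimal, self-contained sketch of the call is shown below. It assumes a recent Lucene release (6.0 or later, where addDocument returns the sequence number of the operation); the index path, class name and field names are illustrative, not part of the Lucene API.

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddDocumentExample {
    public static void main(String[] args) throws Exception {
        // Open (or create) an index in a local directory; the path is illustrative.
        Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
        IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer());
        try (IndexWriter writer = new IndexWriter(dir, cfg)) {
            Document doc = new Document();
            doc.add(new TextField("title", "Hello Lucene", Field.Store.YES));
            // The returned long is the sequence number assigned to this operation.
            long seqNo = writer.addDocument(doc);
            System.out.println("Added document, sequence number " + seqNo);
        } // try-with-resources closes the writer, which commits pending changes by default.
    }
}

Older Lucene versions (used by several of the examples below) declare addDocument as void and construct the IndexWriter differently, but the pattern is the same throughout: build a Document, add Fields to it, and hand it to the writer.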

Usage

From source file:at.ac.univie.mminf.luceneSKOS.analysis.engine.jena.SKOSEngineImpl.java

License:Apache License

/**
 * Creates the synonym index
 *
 * @throws IOException
 */
private void indexSKOSModel() throws IOException {
    IndexWriterConfig cfg = new IndexWriterConfig(analyzer);
    IndexWriter writer = new IndexWriter(indexDir, cfg);
    writer.getConfig().setRAMBufferSizeMB(48);
    /* iterate SKOS concepts, create Lucene docs and add them to the index */
    ResIterator concept_iter = skosModel.listResourcesWithProperty(RDF.type, SKOS.Concept);
    while (concept_iter.hasNext()) {
        Resource skos_concept = concept_iter.next();
        Document concept_doc = createDocumentsFromConcept(skos_concept);
        writer.addDocument(concept_doc);
    }
    writer.close();
}

From source file:at.ac.univie.mminf.luceneSKOS.skos.impl.SKOSEngineImpl.java

License:Apache License

/**
 * Creates the synonym index
 * 
 * @throws IOException
 */
private void indexSKOSModel() throws IOException {
    IndexWriterConfig cfg = new IndexWriterConfig(matchVersion, analyzer);
    IndexWriter writer = new IndexWriter(indexDir, cfg);
    writer.getConfig().setRAMBufferSizeMB(48);

    /* iterate SKOS concepts, create Lucene docs and add them to the index */
    ResIterator concept_iter = skosModel.listResourcesWithProperty(RDF.type, SKOS.Concept);
    while (concept_iter.hasNext()) {
        Resource skos_concept = concept_iter.next();

        Document concept_doc = createDocumentsFromConcept(skos_concept);

        // System.out.println("Adding document to index " + concept_doc);

        writer.addDocument(concept_doc);
    }

    writer.close();
}

From source file:at.lux.fotoretrieval.retrievalengines.LucenePathIndexRetrievalEngine.java

License:Open Source License

public void indexFilesSemantically(String pathToIndex, StatusBar statusBar) {
    if (statusBar != null)
        statusBar.setStatus("Creating index from semantic annotations");

    SAXBuilder builder = new SAXBuilder();
    XMLOutputter outputter = new XMLOutputter(
            Format.getRawFormat().setIndent("").setLineSeparator("").setExpandEmptyElements(false));

    try {
        String[] descriptions = FileOperations.getAllDescriptions(new File(pathToIndex), true);
        if (descriptions == null)
            return;
        float numAllDocsPercent = (float) descriptions.length / 100f;
        DecimalFormat df = (DecimalFormat) NumberFormat.getInstance();
        df.setMaximumFractionDigits(1);

        // Preparing objects for the index:
        HashMap<String, ElementEntry> elementMap = new HashMap<String, ElementEntry>(descriptions.length);
        HashMap<Element, LinkedList<String>> element2document = new HashMap<Element, LinkedList<String>>(
                descriptions.length);

        // in the first run we identify the semantic objects that we want to index and build
        // a table where we can relate them to the documents (identified by their path)
        for (int i = 0; i < descriptions.length; i++) {
            try {
                Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                for (Object aL : l) {
                    Element semanticElement = (Element) aL;
                    String xmlString = outputter.outputString(semanticElement).trim()
                            .replaceAll("id=\"id_[0-9]*\"", "");
                    // check if element is already there, indicator is its string representation.
                    if (!elementMap.keySet().contains(xmlString)) {
                        // it's not here yet, so put it in.
                        elementMap.put(xmlString, new ElementEntry(semanticElement, elementMap.size()));
                        //                            System.out.println(xmlString);
                    }
                    // now get the unified element
                    semanticElement = elementMap.get(xmlString).semanticElement;
                    // and check if there is an entry in the table for where to find the element
                    if (!element2document.keySet().contains(semanticElement)) {
                        element2document.put(semanticElement, new LinkedList<String>());
                    }
                    // and add found document if not already there:
                    List documentList = element2document.get(semanticElement);
                    if (!documentList.contains(descriptions[i]))
                        documentList.add(descriptions[i]);
                }
                if (statusBar != null)
                    statusBar.setStatus(
                            "Parsing documents for nodes: " + df.format((float) i / numAllDocsPercent));
            } catch (JDOMException e1) {
                System.err.println("Exception in document #" + i + ": " + e1.getMessage());
            } catch (IOException e1) {
                e1.printStackTrace();
            }
        }
        // read stats:
        // System.out.println("Got " + countOverallElements + " Elements in " + descriptions.length + " descriptions, " + elementMap.size() + " elements are pairwise different.");

        // Now we can add the nodes to a lucene index:
        // fields: label, id, type, files (separated by '|'), xml, all
        // -------------------------------------------

        // opening the index for writing:
        boolean createFlag = true;
        String indexDir = parseSemanticIndexDirectory(pathToIndex);
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriter writer = new IndexWriter(indexDir, analyzer, createFlag);

        if (statusBar != null)
            statusBar.setStatus("Creating index for " + element2document.size() + " different available nodes");

        // iterating through nodes and storing them:
        for (Element semElement : element2document.keySet()) {
            // needed for later XPath :( otherwise everything in the whole document is retrieved.

            String fileList = getFileListFromNode(element2document.get(semElement));
            Document idxDocument = new Document();
            // adding the file itself ...
            idxDocument.add(new Field("files", fileList, Field.Store.YES, Field.Index.NO));

            //                System.out.println(((Element) o).getTextTrim());

            //                StringBuilder all = new StringBuilder(255);
            //                 adding the label
            //                addToDocument(idxDocument, semElement, "//Label/Name", "label", all);
            String elementLabel = semElement.getChild("Label", semElement.getNamespace())
                    .getChildTextTrim("Name", semElement.getNamespace());
            Field labelField = new Field("label", elementLabel, Field.Store.YES, Field.Index.TOKENIZED);
            labelField.setBoost(1.2f);
            idxDocument.add(labelField);

            // adding the type:
            String elementType = semElement.getAttribute("type", xsi).getValue().trim();
            idxDocument.add(new Field("type", elementType, Field.Store.YES, Field.Index.NO));
            // adding the XML contents:
            String xmlString = outputter.outputString(semElement);
            idxDocument.add(new Field("xml", xmlString, Field.Store.YES, Field.Index.NO));
            // adding the id:
            idxDocument.add(
                    new Field("id", elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id + "",
                            Field.Store.YES, Field.Index.UN_TOKENIZED));
            // TODO: split the indexing for objects based on type:
            // adding all, unstored for retrieval only
            if (elementType.equals("AgentObjectType")) {
                createIndexDocumentFromSemanticAgent(semElement, idxDocument);
            } else if (elementType.equals("EventType")) {
                createIndexDocumentFromSemanticElement(semElement, idxDocument);
            } else if (elementType.equals("SemanticPlaceType")) {
                createIndexDocumentFromSemanticElement(semElement, idxDocument);
            } else if (elementType.equals("SemanticTimeType")) {
                createIndexDocumentFromSemanticElement(semElement, idxDocument);
            } else {
                createIndexDocumentFromSemanticElement(semElement, idxDocument);
            }

            writer.addDocument(idxDocument);

        }
        // now optimize and close the index:
        // todo: open index for appending and/or updating
        writer.optimize();
        writer.close();

        // Now we can create the powerset for each existing graph
        // (based on sorted node ids) and store
        // all resulting graphs within an index.
        // ----------------------------------------------------------
        if (statusBar != null)
            statusBar.setStatus("Creating and merging of available graphs");
        HashMap<Graph, HashSet<String>> graph2document = new HashMap<Graph, HashSet<String>>(
                descriptions.length);
        for (int i = 0; i < descriptions.length; i++)
            try {
                Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                HashMap<String, Integer> docID2overallID = new HashMap<String, Integer>(l.size());
                LinkedList<Relation> relations = new LinkedList<Relation>();
                LinkedList<Integer> nodes = new LinkedList<Integer>();
                for (Object aL : l) {
                    Element semanticElement = (Element) aL;
                    String xmlString = outputter.outputString(semanticElement);
                    int id = elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id;
                    String docID = semanticElement.getAttribute("id").getValue();
                    docID2overallID.put(docID, id);
                    nodes.add(id);
                }
                // get all relations with global ids and eliminate inverse relations
                l = RetrievalToolkit.xpathQuery(e, "//Graph/Relation", null);
                for (Object aL1 : l) {
                    Element relation = (Element) aL1;
                    int source = docID2overallID.get(relation.getAttribute("source").getValue().substring(1));
                    int target = docID2overallID.get(relation.getAttribute("target").getValue().substring(1));
                    String type = relation.getAttribute("type").getValue();
                    type = type.substring(type.lastIndexOf(':') + 1);
                    Relation r = eliminateInverse(new Relation(source, target, type));
                    relations.add(r);
                }

                // now create a graph object
                Collections.sort(nodes);
                Collections.sort(relations);
                LinkedList<Node> nodeList = new LinkedList<Node>();
                for (Integer node : nodes) {
                    nodeList.add(new Node(node));
                }
                Graph g = new Graph(nodeList, relations);
                HashSet<String> docs = new HashSet<String>(1);
                docs.add(descriptions[i]);
                graph2document.put(g, docs);

            } catch (JDOMException e1) {
                System.err.println(new StringBuilder().append("Exception in document #").append(i).append(": ")
                        .append(e1.getMessage()).toString());
            }

        HashMap<String, Graph> str2graph = new HashMap<String, Graph>(graph2document.size() / 2);
        HashMap<Graph, HashSet<String>> g2d = new HashMap<Graph, HashSet<String>>(descriptions.length);

        /*
        For now we reduce the number of graphs by identifying and merging duplicates and
        removing redundant entries:
        */
        for (Graph g : graph2document.keySet()) {
            if (str2graph.containsKey(g.toString())) {
                g2d.get(str2graph.get(g.toString())).addAll(graph2document.get(g));
            } else {
                str2graph.put(g.toString(), g);
                g2d.put(g, graph2document.get(g));
            }
        }
        graph2document = g2d;
        System.out.println(graph2document.size() + " non trivial different graphs were found");
        // now put all the available graphs into an index:
        // -----------------------------------------------

        // for now we will store a simple text file:
        if (statusBar != null)
            statusBar.setStatus("Saving index of paths");

        boolean createPathIndexFlag = true;
        String pathIndexDir = parsePathIndexDirectory(pathToIndex);
        IndexWriter pathIndexWriter = new IndexWriter(pathIndexDir, new GraphAnalyzer(), createPathIndexFlag);

        for (Graph graph : graph2document.keySet()) {
            HashSet<String> files = graph2document.get(graph);
            Document idxDocument = new Document();
            // adding the file itself ...
            for (String s : files) {
                idxDocument.add(new Field("file", s, Field.Store.YES, Field.Index.NO));
            }
            // adding the graph ...
            idxDocument.add(new Field("graph", graph.toString(), Field.Store.YES, Field.Index.TOKENIZED));
            //                idxDocument.add(Field.UnIndexed("graph", graph.toString()));
            // adding the paths
            StringBuilder sb = new StringBuilder(256);
            sb.append(graph.toString());
            List<Path> pathList = (new LabeledGraph(graph)).get2Paths();
            if (!pathList.isEmpty())
                sb.append(' ');
            for (Iterator<Path> iterator1 = pathList.iterator(); iterator1.hasNext();) {
                Path path = iterator1.next();
                sb.append(path.toString());
                if (iterator1.hasNext())
                    sb.append(' ');
            }
            idxDocument.add(new Field("paths", sb.toString(), Field.Store.YES, Field.Index.TOKENIZED));
            pathIndexWriter.addDocument(idxDocument);
        }
        // now optimize and close the index:
        pathIndexWriter.optimize();
        pathIndexWriter.close();

    } catch (IOException e) {
        e.printStackTrace();
    }

}

From source file:at.lux.fotoretrieval.retrievalengines.LuceneRetrievalEngine.java

License:Open Source License

/**
 * In general we take the base path of our search as the pathToIndex parameter.
 * We then add the directory "index" and create the index there.
 *
 * @param pathToIndex
 * @param statusBar
 */
public void indexFiles(String pathToIndex, StatusBar statusBar) {
    // parsing and eventually creating the directory for the index ...
    String indexDir = parseFulltextIndexDirectory(pathToIndex);

    Analyzer analyzer = new StandardAnalyzer();
    boolean createFlag = true;
    SAXBuilder builder = new SAXBuilder();
    String prefix = "Creating fulltext index: ";
    try {
        IndexWriter writer = new IndexWriter(indexDir, analyzer, createFlag);
        String[] descriptions = FileOperations.getAllDescriptions(new File(pathToIndex), true);
        if (descriptions == null)
            return;
        float numAllDocsPercent = (float) descriptions.length / 100f;
        DecimalFormat df = (DecimalFormat) NumberFormat.getInstance();
        df.setMaximumFractionDigits(1);

        for (int i = 0; i < descriptions.length; i++) {
            try {
                Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                Document idxDocument = new Document();
                // adding the file itself ...
                idxDocument.add(new Field("file", descriptions[i], Field.Store.YES, Field.Index.NO));
                // adding all given names
                StringBuilder all = new StringBuilder(255);

                List l = RetrievalToolkit.xpathQuery(e, "//Graph/Relation", null);
                //                    System.out.println("NumberOfRelations: " + l.size());

                addToDocument(idxDocument, e, "//Agent/Name/GivenName", "GivenName", all);
                addToDocument(idxDocument, e, "//Agent/Name/FamilyName", "FamilyName", all);
                addToDocument(idxDocument, e, "//Label/Name", "Label", all);
                addToDocument(idxDocument, e, "//FreeTextAnnotation", "FreeTextAnnotation", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/Who/Name", "Who", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/Where/Name", "Where", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/How/Name", "How", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/Why/Name", "Why", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/When/Name", "When", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/WhatObject/Name", "WhatObjects", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/WhatAction/Name", "WhatAction", all);

                idxDocument.add(new Field("all", all.toString(), Field.Store.NO, Field.Index.TOKENIZED));

                writer.addDocument(idxDocument);

                if (statusBar != null) {
                    StringBuilder status = new StringBuilder(13).append(prefix);
                    status.append(df.format(((float) i) / numAllDocsPercent));
                    status.append('%');
                    statusBar.setStatus(status.toString());
                }

            } catch (Exception e1) {
                System.err.println("Error with file " + descriptions[i] + " (" + e1.getMessage() + ")");
            }
        }
        writer.optimize();
        writer.close();
        if (statusBar != null) {
            statusBar.setStatus("Indexing finished");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:at.lux.fotoretrieval.retrievalengines.LuceneRetrievalEngine.java

License:Open Source License

public void indexFilesSemantically(String pathToIndex, StatusBar statusBar) {
    if (statusBar != null)
        statusBar.setStatus("Creating index from semantic annotations");

    SAXBuilder builder = new SAXBuilder();
    XMLOutputter outputter = new XMLOutputter(
            Format.getRawFormat().setIndent("").setLineSeparator("").setExpandEmptyElements(false));

    try {
        String[] descriptions = FileOperations.getAllDescriptions(new File(pathToIndex), true);
        if (descriptions == null)
            return;
        float numAllDocsPercent = (float) descriptions.length / 100f;
        DecimalFormat df = (DecimalFormat) NumberFormat.getInstance();
        df.setMaximumFractionDigits(1);

        // Preparing objects for the index:
        HashMap<String, ElementEntry> elementMap = new HashMap<String, ElementEntry>(descriptions.length);
        HashMap<Element, LinkedList<String>> element2document = new HashMap<Element, LinkedList<String>>(
                descriptions.length);

        // in the first run we identify the semantic objects that we want to index and build
        // a table where we can relate them to the documents (identified by their path)
        for (int i = 0; i < descriptions.length; i++) {
            try {
                Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                    Element semanticElement = (Element) iterator.next();
                    String xmlString = outputter.outputString(semanticElement).trim()
                            .replaceAll("id=\"id_[0-9]*\"", "");
                    // check if element is already there, indicator is its string representation.
                    if (!elementMap.keySet().contains(xmlString)) {
                        // it's not here yet, so put it in.
                        elementMap.put(xmlString, new ElementEntry(semanticElement, elementMap.size()));
                        //                            System.out.println(xmlString);
                    }
                    // now get the unified element
                    semanticElement = elementMap.get(xmlString).semanticElement;
                    // and check if there is an entry in the table for where to find the element
                    if (!element2document.keySet().contains(semanticElement)) {
                        element2document.put(semanticElement, new LinkedList<String>());
                    }
                    // and add found document if not already there:
                    List documentList = element2document.get(semanticElement);
                    if (!documentList.contains(descriptions[i]))
                        documentList.add(descriptions[i]);
                }
                if (statusBar != null)
                    statusBar.setStatus(
                            "Parsing documents for nodes: " + df.format((float) i / numAllDocsPercent));
            } catch (JDOMException e1) {
                System.err.println("Exception in document #" + i + ": " + e1.getMessage());
            } catch (IOException e1) {
                e1.printStackTrace();
            }
        }
        // read stats:
        // System.out.println("Got " + countOverallElements + " Elements in " + descriptions.length + " descriptions, " + elementMap.size() + " elements are pairwise different.");

        // Now we can add the nodes to a lucene index:
        // fields: label, id, type, files (separated by '|'), xml, all
        // -------------------------------------------

        // opening the index for writing:
        boolean createFlag = true;
        String indexDir = parseSemanticIndexDirectory(pathToIndex);
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriter writer = new IndexWriter(indexDir, analyzer, createFlag);

        if (statusBar != null)
            statusBar.setStatus("Creating index for " + element2document.size() + " different available nodes");

        // iterating through nodes and storing them:
        for (Iterator<Element> iterator = element2document.keySet().iterator(); iterator.hasNext();) {
            Element semElement = iterator.next();
            // needed for later XPath :( otherwise everything in the whole document is retrieved.

            String fileList = getFileListFromNode(element2document.get(semElement));
            Document idxDocument = new Document();
            // adding the file itself ...
            idxDocument.add(new Field("files", fileList, Field.Store.YES, Field.Index.NO));

            //                System.out.println(((Element) o).getTextTrim());

            StringBuilder all = new StringBuilder(255);
            // adding the label
            //                addToDocument(idxDocument, semElement, "//Label/Name", "label", all);
            String elementLabel = semElement.getChild("Label", semElement.getNamespace())
                    .getChildTextTrim("Name", semElement.getNamespace());
            idxDocument.add(new Field("label", elementLabel, Field.Store.YES, Field.Index.TOKENIZED));

            // adding the type:
            String elementType = semElement.getAttribute("type", xsi).getValue().trim();
            idxDocument.add(new Field("type", elementType, Field.Store.YES, Field.Index.NO));
            // adding the XML contents:
            String xmlString = outputter.outputString(semElement);
            idxDocument.add(new Field("xml", xmlString, Field.Store.YES, Field.Index.NO));
            // adding the id:
            idxDocument.add(
                    new Field("id", elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id + "",
                            Field.Store.YES, Field.Index.NO));
            // adding all, unstored for retrieval only
            List l = RetrievalToolkit.xpathQuery(semElement, "*//*", null);
            for (Iterator it3 = l.iterator(); it3.hasNext();) {
                Element e = (Element) it3.next();
                all.append(e.getTextTrim());
                all.append(" ");
            }
            idxDocument.add(new Field("all", all.toString(), Field.Store.NO, Field.Index.TOKENIZED));

            writer.addDocument(idxDocument);

        }
        // now optimize and close the index:
        // todo: open index for appending and/or updating
        writer.optimize();
        writer.close();

        // Now we can create the powerset for each existing graph
        // (based on sorted node ids) and store
        // all resulting graphs within an index.
        // ----------------------------------------------------------
        if (statusBar != null)
            statusBar.setStatus("Creating and merging powersets of available graphs");
        HashMap<Graph, HashSet<String>> graph2document = new HashMap<Graph, HashSet<String>>(
                descriptions.length);
        for (int i = 0; i < descriptions.length; i++) {
            try {
                Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                HashMap<String, Integer> docID2overallID = new HashMap<String, Integer>(l.size());
                LinkedList<Relation> relations = new LinkedList<Relation>();
                LinkedList<Integer> nodes = new LinkedList<Integer>();
                for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                    Element semanticElement = (Element) iterator.next();
                    String xmlString = outputter.outputString(semanticElement);
                    int id = elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id;
                    String docID = semanticElement.getAttribute("id").getValue();
                    docID2overallID.put(docID, id);
                    nodes.add(id);
                }
                // get all relations with global ids and eliminate inverse relations
                l = RetrievalToolkit.xpathQuery(e, "//Graph/Relation", null);
                for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                    Element relation = (Element) iterator.next();
                    int source = docID2overallID.get(relation.getAttribute("source").getValue().substring(1));
                    int target = docID2overallID.get(relation.getAttribute("target").getValue().substring(1));
                    String type = relation.getAttribute("type").getValue();
                    type = type.substring(type.lastIndexOf(':') + 1);
                    Relation r = eliminateInverse(new Relation(source, target, type));
                    relations.add(r);
                }

                // now create a graph object
                Collections.sort(nodes);
                Collections.sort(relations);
                LinkedList<Node> nodeList = new LinkedList<Node>();
                for (Iterator<Integer> iterator = nodes.iterator(); iterator.hasNext();) {
                    nodeList.add(new Node(iterator.next()));
                }
                Graph g = new Graph(nodeList, relations);
                //                    List<Graph> powerSet = new LinkedList<Graph>();
                //                    powerSet.add(g);
                HashSet<String> docs = new HashSet<String>(1);
                docs.add(descriptions[i]);
                graph2document.put(g, docs);
                /*
                // add all these subgraphs and the reference to the document to
                // a data structure:
                for (Iterator<Graph> iterator = powerSet.iterator(); iterator.hasNext();) {
                    Graph graph = iterator.next();
                    // List<Graph> relationsPowerSet = graph.getPowerSetOfRelations();
                    // for (Iterator<Graph> iterator1 = relationsPowerSet.iterator(); iterator1.hasNext();) {
                    //     Graph graph1 = iterator1.next();
                    // }
                    // add graph if not trivial:
                    if (graph.getNodes().size() > 1) {
                        // containsKey for Graph does not match my needs -
                        // different graph objects reference the same graph!
                        if (string2graph.containsKey(graph.toString())) {
                            graph = string2graph.get(graph.toString());
                            graph2document.get(graph).add(descriptions[i]);
                        } else {
                            HashSet<String> docs = new HashSet<String>(1);
                            docs.add(descriptions[i]);
                            graph2document.put(graph, docs);
                        }
                    }
                }
                */
            } catch (JDOMException e1) {
                System.err.println("Exception in document #" + i + ": " + e1.getMessage());
            }
        }

        HashMap<String, Graph> str2graph = new HashMap<String, Graph>(graph2document.size() / 2);
        HashMap<Graph, HashSet<String>> g2d = new HashMap<Graph, HashSet<String>>(descriptions.length);

        /*
        For now we reduce the number of graphs by identifying and merging duplicates and
        removing redundant entries:
        */
        for (Iterator<Graph> iterator = graph2document.keySet().iterator(); iterator.hasNext();) {
            Graph g = iterator.next();
            if (str2graph.containsKey(g.toString())) {
                g2d.get(str2graph.get(g.toString())).addAll(graph2document.get(g));
            } else {
                str2graph.put(g.toString(), g);
                g2d.put(g, graph2document.get(g));
            }
        }
        graph2document = g2d;
        System.out.println(graph2document.size() + " non trivial different graphs were found");
        // now put all the available graphs into an index:
        // -----------------------------------------------
        // todo: create real fast storable index of subgraphs instead of file :-) possible candidate a trie

        // for now we will store a simple text file:
        if (statusBar != null)
            statusBar.setStatus("Storing powersets of available graphs as file");
        String indexFile;
        if (!pathToIndex.endsWith(File.separator)) {
            indexFile = pathToIndex + File.separator + "idx_graphs.list";
        } else {
            indexFile = pathToIndex + "idx_graphs.list";
        }
        File f = new File(indexFile);
        BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(f, false))));
        for (Iterator<Graph> iterator = graph2document.keySet().iterator(); iterator.hasNext();) {
            Graph g = iterator.next();
            bw.write(g.toString());
            for (Iterator<String> iterator1 = graph2document.get(g).iterator(); iterator1.hasNext();) {
                String s = iterator1.next();
                bw.write("|" + s);
            }
            bw.write("\n");
        }
        bw.close();
    } catch (IOException e) {
        e.printStackTrace();
    }

}

From source file:au.edu.unimelb.csse.analyser.NodeCacheTest.java

License:Apache License

public void testReusesNodesWhileIndexing() throws Exception {
    String[] sents = new String[] { "(A(B C)(D(E F)))", "(A(B(C D)))", "(A(B C)(D(E(F(G H)))))", "(A(B C))" };
    String[] jsonSents = new String[sents.length];
    String2NodesParser parser = new String2NodesParser();
    assertEquals(0, NodeCache.cacheSize());
    int[] expectedCounts = new int[] { 0, 2, 0, 5 };
    //First sent: 6 nodes are used but they are not returned until the next sentence is read. 
    //Hence the cache still returns a size of 0
    //Second sent: 6 nodes are returned back but the new sentence contains 4 nodes
    //6 - 4 = 2
    //Third sent: 4 nodes are returned back but the new sentence contains 8 nodes
    //size shows 0 again
    //Fourth sent: 8 nodes are returned back but the new sentence contains 3 nodes
    //8 - 3 = 5

    for (int i = 0; i < sents.length; i++) {
        jsonSents[i] = parser.parse(sents[i]).asJSONString();
        assertEquals(expectedCounts[i], NodeCache.cacheSize());
    }
    Analyzer analyser = new NodeTreebankAnalyser(false);
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, analyser, true, IndexWriter.MaxFieldLength.UNLIMITED);

    Document d = new Document();
    d.add(new Field("sent", jsonSents[0], Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(d);
    //No change to Node cache
    assertEquals(5, NodeCache.cacheSize());

    d = new Document();
    d.add(new Field("sent", jsonSents[1], Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(d);
    //No change to Node cache
    assertEquals(5, NodeCache.cacheSize());

    d = new Document();
    d.add(new Field("sent", jsonSents[2], Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(d);
    //No change to Node cache
    assertEquals(5, NodeCache.cacheSize());

    d = new Document();
    d.add(new Field("sent", jsonSents[3], Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(d);
    //No change to Node cache
    assertEquals(5, NodeCache.cacheSize());

}

From source file:au.edu.unimelb.csse.join.JoinFunctionalTest.java

License:Apache License

/**
 * This test is effectively disabled: to run it, match counting has to be enabled in JoinLogic.
 * @throws Exception
 */
public void testNumberOfCallsToMatch() throws Exception {
    String sent = "(NP" + "(NP" + "(DT The)" + "(NN year))" + "(NP" + "(NP(CD 1956))" + "(PP" + "(IN in)"
            + "(NP(JJ rugby)(NN union))" + ")" + ")" + "(. .)" + ")";
    Analyzer analyser = new FastStringAnalyser();
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, analyser, true, IndexWriter.MaxFieldLength.UNLIMITED);

    Document d = new Document();
    d.add(new Field("sent", sent, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(d);

    writer.close();

    IndexSearcher searcher = new IndexSearcher(dir);
    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.SIMPLE, false, 6);

    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.SIMPLE_WITH_FC, false, 1);

    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.EARLY_STOP, false, 2);

    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.EARLY_STOP_WITH_FC, false, 1);

    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.SIMPLE, true, 6);

    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.SIMPLE_WITH_FC, true, 5);

    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.EARLY_STOP, true, 6);

    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.EARLY_STOP_WITH_FC, true, 5);

    assertNumberOfComparisons(searcher, "//NP//NP//NP", TermJoinType.SIMPLE, false, 23);

    assertNumberOfComparisons(searcher, "//NP//NP//NP", TermJoinType.SIMPLE_WITH_FC, false, 10);

    assertNumberOfComparisons(searcher, "//NP//NP//NP", TermJoinType.EARLY_STOP, false, 10);

    assertNumberOfComparisons(searcher, "//NP//NP//NP", TermJoinType.EARLY_STOP_WITH_FC, false, 8);

}

From source file:au.edu.unimelb.csse.join.JoinFunctionalTest.java

License:Apache License

public void testFilterjoin() throws Exception {
    String sent = "(NP" + "(NP" + "(DT The)" + "(NN year))" + "(NP" + "(NP(CD 1956))" + "(PP" + "(IN in)"
            + "(NP(JJ rugby)(NN union))" + ")" + ")" + "(. .)" + ")";
    Analyzer analyser = new FastStringAnalyser();
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, analyser, true, IndexWriter.MaxFieldLength.UNLIMITED);

    Document d = new Document();
    d.add(new Field("sent", sent, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(d);

    writer.close();

    IndexSearcher searcher = new IndexSearcher(dir);

    boolean[] lookaheadOptions = new boolean[] { false, true };
    for (TermJoinType type : TermJoinType.values()) {
        for (boolean lookahead : lookaheadOptions) {
            QueryBuilder builder = new QueryBuilder("//PP[/IN AND /NP]");
            TreebankQuery query = builder.parse(type, lookahead);
            SimpleHitCollector hitCollector = new SimpleHitCollector(10);
            searcher.search(query, hitCollector);
            assertEquals(1, hitCollector.totalHits);
        }
    }

    QueryBuilder builder = new QueryBuilder("//PP[/IN AND /NP/JJ/rugby]");
    TreebankQuery query = builder.parse(TermJoinType.SIMPLE, true);
    SimpleHitCollector hitCollector = new SimpleHitCollector(10);
    searcher.search(query, hitCollector);
    assertEquals(1, hitCollector.totalHits);

}

From source file:au.org.ala.names.search.ALANameIndexer.java

License:Open Source License

/**
 * Creates the temporary index that provides a lookup of checklist bank id to
 * GUID.
 */
private IndexSearcher createTmpGuidIndex(String cbExportFile) throws Exception {
    System.out.println("Starting to create the tmp guid index...");
    IndexWriter iw = createIndexWriter(new File("/data/tmp/guid"), new KeywordAnalyzer(), true);
    au.com.bytecode.opencsv.CSVReader cbreader = new au.com.bytecode.opencsv.CSVReader(
            new FileReader(cbExportFile), '\t', '"', '/', 1);
    for (String[] values = cbreader.readNext(); values != null; values = cbreader.readNext()) {
        Document doc = new Document();
        String id = values[POS_ID];
        String guid = values[POS_LSID];
        doc.add(new StringField("id", id, Store.YES));
        // fall back to the checklist bank id when no GUID was exported
        if (StringUtils.isEmpty(guid))
            guid = id;

        doc.add(new StoredField("guid", guid));
        iw.addDocument(doc);
    }
    System.out.println("Finished writing the tmp guid index...");
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    //As of lucene 4.0 all IndexReaders are read only
    return new IndexSearcher(DirectoryReader.open(FSDirectory.open(new File("/data/tmp/guid"))));
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License:Open Source License

/**
 * Adds the extra ALA concepts from the legislated lists that are missing from the NSL.
 *
 * @param iw
 * @param file
 * @throws Exception
 */
private void addExtraALAConcept(IndexWriter iw, String file) throws Exception {
    if (new File(file).exists()) {
        au.com.bytecode.opencsv.CSVReader reader = new au.com.bytecode.opencsv.CSVReader(new FileReader(file),
                ',', '"', '\\', 1);
        for (String[] values = reader.readNext(); values != null; values = reader.readNext()) {
            String lsid = values[0];
            String scientificName = values[1];
            String authority = values[2];
            Document doc = createALAIndexDocument(scientificName, "-1", lsid, authority, null);
            iw.addDocument(doc);
        }
    }
}