Example usage for org.apache.lucene.index IndexWriter addDocument

Introduction

On this page you can find example usages of org.apache.lucene.index.IndexWriter.addDocument.

Prototype

public long addDocument(Iterable<? extends IndexableField> doc) throws IOException 

Document

Adds a document to this index.
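
A minimal, self-contained sketch of the call is shown below. It assumes a recent Lucene release (6.0 or later, where addDocument returns the sequence number of the operation); the index path, class name and field names are illustrative, not part of the Lucene API.

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddDocumentExample {
    public static void main(String[] args) throws Exception {
        // Open (or create) an index in a local directory; the path is illustrative.
        Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
        IndexWriterConfig cfg = new IndexWriterConfig(new StandardAnalyzer());
        try (IndexWriter writer = new IndexWriter(dir, cfg)) {
            Document doc = new Document();
            doc.add(new TextField("title", "Hello Lucene", Field.Store.YES));
            // The returned long is the sequence number assigned to this operation.
            long seqNo = writer.addDocument(doc);
            System.out.println("Added document, sequence number " + seqNo);
        } // try-with-resources closes the writer, which commits pending changes by default.
    }
}

Older Lucene versions (used by several of the examples below) declare addDocument as void and construct the IndexWriter differently, but the pattern is the same throughout: build a Document, add Fields to it, and hand it to the writer.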

Usage

From source file:at.ac.univie.mminf.luceneSKOS.analysis.engine.jena.SKOSEngineImpl.java

License:Apache License

/**
 * Creates the synonym index
 *
 * @throws IOException
 */
private void indexSKOSModel() throws IOException {
    IndexWriterConfig cfg = new IndexWriterConfig(analyzer);
    IndexWriter writer = new IndexWriter(indexDir, cfg);
    writer.getConfig().setRAMBufferSizeMB(48);
    /* iterate SKOS concepts, create Lucene docs and add them to the index */
    ResIterator concept_iter = skosModel.listResourcesWithProperty(RDF.type, SKOS.Concept);
    while (concept_iter.hasNext()) {
        Resource skos_concept = concept_iter.next();
        Document concept_doc = createDocumentsFromConcept(skos_concept);
        writer.addDocument(concept_doc);
    }
    writer.close();
}

From source file:at.ac.univie.mminf.luceneSKOS.skos.impl.SKOSEngineImpl.java

License:Apache License

/**
 * Creates the synonym index
 * 
 * @throws IOException
 */
private void indexSKOSModel() throws IOException {
    IndexWriterConfig cfg = new IndexWriterConfig(matchVersion, analyzer);
    IndexWriter writer = new IndexWriter(indexDir, cfg);
    writer.getConfig().setRAMBufferSizeMB(48);

    /* iterate SKOS concepts, create Lucene docs and add them to the index */
    ResIterator concept_iter = skosModel.listResourcesWithProperty(RDF.type, SKOS.Concept);
    while (concept_iter.hasNext()) {
        Resource skos_concept = concept_iter.next();

        Document concept_doc = createDocumentsFromConcept(skos_concept);

        // System.out.println("Adding document to index " + concept_doc);

        writer.addDocument(concept_doc);
    }

    writer.close();
}

From source file:at.lux.fotoretrieval.retrievalengines.LucenePathIndexRetrievalEngine.java

License:Open Source License

public void indexFilesSemantically(String pathToIndex, StatusBar statusBar) {
    if (statusBar != null)
        statusBar.setStatus("Creating index from semantic annotations");

    SAXBuilder builder = new SAXBuilder();
    XMLOutputter outputter = new XMLOutputter(
            Format.getRawFormat().setIndent("").setLineSeparator("").setExpandEmptyElements(false));

    try {
        String[] descriptions = FileOperations.getAllDescriptions(new File(pathToIndex), true);
        if (descriptions == null)
            return;
        float numAllDocsPercent = (float) descriptions.length / 100f;
        DecimalFormat df = (DecimalFormat) NumberFormat.getInstance();
        df.setMaximumFractionDigits(1);

        // Preparing objects for the index:
        HashMap<String, ElementEntry> elementMap = new HashMap<String, ElementEntry>(descriptions.length);
        HashMap<Element, LinkedList<String>> element2document = new HashMap<Element, LinkedList<String>>(
                descriptions.length);

        // in the first run we identify the semantic objects that we want to index and build
        // a table where we can relate them to the documents (identified by their path)
        for (int i = 0; i < descriptions.length; i++) {
            try {
                Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                for (Object aL : l) {
                    Element semanticElement = (Element) aL;
                    String xmlString = outputter.outputString(semanticElement).trim()
                            .replaceAll("id=\"id_[0-9]*\"", "");
                    // check if element is already there, indicator is its string representation.
                    if (!elementMap.keySet().contains(xmlString)) {
                        // it's not here yet, so put it in.
                        elementMap.put(xmlString, new ElementEntry(semanticElement, elementMap.size()));
                        //                            System.out.println(xmlString);
                    }
                    // now get the unified element
                    semanticElement = elementMap.get(xmlString).semanticElement;
                    // and check if there is an entry in the table for where to find the element
                    if (!element2document.keySet().contains(semanticElement)) {
                        element2document.put(semanticElement, new LinkedList<String>());
                    }
                    // and add found document if not already there:
                    List documentList = element2document.get(semanticElement);
                    if (!documentList.contains(descriptions[i]))
                        documentList.add(descriptions[i]);
                }
                if (statusBar != null)
                    statusBar.setStatus(
                            "Parsing documents for nodes: " + df.format((float) i / numAllDocsPercent));
            } catch (JDOMException e1) {
                System.err.println("Exception in document #" + i + ": " + e1.getMessage());
            } catch (IOException e1) {
                e1.printStackTrace();
            }
        }
        // read stats:
        // System.out.println("Got " + countOverallElements + " Elements in " + descriptions.length + " descriptions, " + elementMap.size() + " elements are pairwise different.");

        // Now we can add the nodes to a lucene index:
        // fields: label, id, type, files (separated by '|'), xml, all
        // -------------------------------------------

        // opening the index for writing:
        boolean createFlag = true;
        String indexDir = parseSemanticIndexDirectory(pathToIndex);
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriter writer = new IndexWriter(indexDir, analyzer, createFlag);

        if (statusBar != null)
            statusBar.setStatus("Creating index for " + element2document.size() + " different available nodes");

        // iterating through nodes and storing them:
        for (Element semElement : element2document.keySet()) {
            // needed for later XPath :( otherwise everything in the whole document is retrieved.

            String fileList = getFileListFromNode(element2document.get(semElement));
            Document idxDocument = new Document();
            // adding the file itself ...
            idxDocument.add(new Field("files", fileList, Field.Store.YES, Field.Index.NO));

            //                System.out.println(((Element) o).getTextTrim());

            //                StringBuilder all = new StringBuilder(255);
            //                 adding the label
            //                addToDocument(idxDocument, semElement, "//Label/Name", "label", all);
            String elementLabel = semElement.getChild("Label", semElement.getNamespace())
                    .getChildTextTrim("Name", semElement.getNamespace());
            Field labelField = new Field("label", elementLabel, Field.Store.YES, Field.Index.TOKENIZED);
            labelField.setBoost(1.2f);
            idxDocument.add(labelField);

            // adding the type:
            String elementType = semElement.getAttribute("type", xsi).getValue().trim();
            idxDocument.add(new Field("type", elementType, Field.Store.YES, Field.Index.NO));
            // adding the XML contents:
            String xmlString = outputter.outputString(semElement);
            idxDocument.add(new Field("xml", xmlString, Field.Store.YES, Field.Index.NO));
            // adding the id:
            idxDocument.add(
                    new Field("id", elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id + "",
                            Field.Store.YES, Field.Index.UN_TOKENIZED));
            // TODO: split the indexing for objects based on type:
            // adding all, unstored for retrieval only
            if (elementType.equals("AgentObjectType")) {
                createIndexDocumentFromSemanticAgent(semElement, idxDocument);
            } else if (elementType.equals("EventType")) {
                createIndexDocumentFromSemanticElement(semElement, idxDocument);
            } else if (elementType.equals("SemanticPlaceType")) {
                createIndexDocumentFromSemanticElement(semElement, idxDocument);
            } else if (elementType.equals("SemanticTimeType")) {
                createIndexDocumentFromSemanticElement(semElement, idxDocument);
            } else {
                createIndexDocumentFromSemanticElement(semElement, idxDocument);
            }

            writer.addDocument(idxDocument);

        }
        // now optimize and close the index:
        // todo: open index for appending and/or updating
        writer.optimize();
        writer.close();

        // Now we can create the powerset for each existing graph
        // (based on sorted node ids) and store
        // all resulting graphs within an index.
        // ----------------------------------------------------------
        if (statusBar != null)
            statusBar.setStatus("Creating and merging of available graphs");
        HashMap<Graph, HashSet<String>> graph2document = new HashMap<Graph, HashSet<String>>(
                descriptions.length);
        for (int i = 0; i < descriptions.length; i++)
            try {
                Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                HashMap<String, Integer> docID2overallID = new HashMap<String, Integer>(l.size());
                LinkedList<Relation> relations = new LinkedList<Relation>();
                LinkedList<Integer> nodes = new LinkedList<Integer>();
                for (Object aL : l) {
                    Element semanticElement = (Element) aL;
                    String xmlString = outputter.outputString(semanticElement);
                    int id = elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id;
                    String docID = semanticElement.getAttribute("id").getValue();
                    docID2overallID.put(docID, id);
                    nodes.add(id);
                }
                // get all relations with global ids and eliminate inverse relations
                l = RetrievalToolkit.xpathQuery(e, "//Graph/Relation", null);
                for (Object aL1 : l) {
                    Element relation = (Element) aL1;
                    int source = docID2overallID.get(relation.getAttribute("source").getValue().substring(1));
                    int target = docID2overallID.get(relation.getAttribute("target").getValue().substring(1));
                    String type = relation.getAttribute("type").getValue();
                    type = type.substring(type.lastIndexOf(':') + 1);
                    Relation r = eliminateInverse(new Relation(source, target, type));
                    relations.add(r);
                }

                // now create a graph object
                Collections.sort(nodes);
                Collections.sort(relations);
                LinkedList<Node> nodeList = new LinkedList<Node>();
                for (Integer node : nodes) {
                    nodeList.add(new Node(node));
                }
                Graph g = new Graph(nodeList, relations);
                HashSet<String> docs = new HashSet<String>(1);
                docs.add(descriptions[i]);
                graph2document.put(g, docs);

            } catch (JDOMException e1) {
                System.err.println(new StringBuilder().append("Exception in document #").append(i).append(": ")
                        .append(e1.getMessage()).toString());
            }

        HashMap<String, Graph> str2graph = new HashMap<String, Graph>(graph2document.size() / 2);
        HashMap<Graph, HashSet<String>> g2d = new HashMap<Graph, HashSet<String>>(descriptions.length);

        /*
        For now we reduce the number of graphs by identifying and merging duplicates and
        removing redundant entries:
        */
        for (Graph g : graph2document.keySet()) {
            if (str2graph.containsKey(g.toString())) {
                g2d.get(str2graph.get(g.toString())).addAll(graph2document.get(g));
            } else {
                str2graph.put(g.toString(), g);
                g2d.put(g, graph2document.get(g));
            }
        }
        graph2document = g2d;
        System.out.println(graph2document.size() + " non trivial different graphs were found");
        // now put all the available graphs into an index:
        // -----------------------------------------------

        // for now we will store a simple text file:
        if (statusBar != null)
            statusBar.setStatus("Saving index of paths");

        boolean createPathIndexFlag = true;
        String pathIndexDir = parsePathIndexDirectory(pathToIndex);
        IndexWriter pathIndexWriter = new IndexWriter(pathIndexDir, new GraphAnalyzer(), createPathIndexFlag);

        for (Graph graph : graph2document.keySet()) {
            HashSet<String> files = graph2document.get(graph);
            Document idxDocument = new Document();
            // adding the file itself ...
            for (String s : files) {
                idxDocument.add(new Field("file", s, Field.Store.YES, Field.Index.NO));
            }
            // adding the graph ...
            idxDocument.add(new Field("graph", graph.toString(), Field.Store.YES, Field.Index.TOKENIZED));
            //                idxDocument.add(Field.UnIndexed("graph", graph.toString()));
            // adding the paths
            StringBuilder sb = new StringBuilder(256);
            sb.append(graph.toString());
            List<Path> pathList = (new LabeledGraph(graph)).get2Paths();
            if (!pathList.isEmpty())
                sb.append(' ');
            for (Iterator<Path> iterator1 = pathList.iterator(); iterator1.hasNext();) {
                Path path = iterator1.next();
                sb.append(path.toString());
                if (iterator1.hasNext())
                    sb.append(' ');
            }
            idxDocument.add(new Field("paths", sb.toString(), Field.Store.YES, Field.Index.TOKENIZED));
            pathIndexWriter.addDocument(idxDocument);
        }
        // now optimize and close the index:
        pathIndexWriter.optimize();
        pathIndexWriter.close();

    } catch (IOException e) {
        e.printStackTrace();
    }

}

From source file:at.lux.fotoretrieval.retrievalengines.LuceneRetrievalEngine.java

License:Open Source License

/**
 * In general we take the base path of our search as the pathToIndex parameter.
 * We then add the directory "index" and create the index there.
 *
 * @param pathToIndex
 * @param statusBar
 */
public void indexFiles(String pathToIndex, StatusBar statusBar) {
    // parsing and eventually creating the directory for the index ...
    String indexDir = parseFulltextIndexDirectory(pathToIndex);

    Analyzer analyzer = new StandardAnalyzer();
    boolean createFlag = true;
    SAXBuilder builder = new SAXBuilder();
    String prefix = "Creating fulltext index: ";
    try {
        IndexWriter writer = new IndexWriter(indexDir, analyzer, createFlag);
        String[] descriptions = FileOperations.getAllDescriptions(new File(pathToIndex), true);
        if (descriptions == null)
            return;
        float numAllDocsPercent = (float) descriptions.length / 100f;
        DecimalFormat df = (DecimalFormat) NumberFormat.getInstance();
        df.setMaximumFractionDigits(1);

        for (int i = 0; i < descriptions.length; i++) {
            try {
                Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                Document idxDocument = new Document();
                // adding the file itself ...
                idxDocument.add(new Field("file", descriptions[i], Field.Store.YES, Field.Index.NO));
                // adding all given names
                StringBuilder all = new StringBuilder(255);

                List l = RetrievalToolkit.xpathQuery(e, "//Graph/Relation", null);
                //                    System.out.println("NumberOfRelations: " + l.size());

                addToDocument(idxDocument, e, "//Agent/Name/GivenName", "GivenName", all);
                addToDocument(idxDocument, e, "//Agent/Name/FamilyName", "FamilyName", all);
                addToDocument(idxDocument, e, "//Label/Name", "Label", all);
                addToDocument(idxDocument, e, "//FreeTextAnnotation", "FreeTextAnnotation", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/Who/Name", "Who", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/Where/Name", "Where", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/How/Name", "How", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/Why/Name", "Why", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/When/Name", "When", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/WhatObject/Name", "WhatObjects", all);
                addToDocument(idxDocument, e, "//StructuredAnnotation/WhatAction/Name", "WhatAction", all);

                idxDocument.add(new Field("all", all.toString(), Field.Store.NO, Field.Index.TOKENIZED));

                writer.addDocument(idxDocument);

                if (statusBar != null) {
                    StringBuilder status = new StringBuilder(13).append(prefix);
                    status.append(df.format(((float) i) / numAllDocsPercent));
                    status.append('%');
                    statusBar.setStatus(status.toString());
                }

            } catch (Exception e1) {
                System.err.println("Error with file " + descriptions[i] + " (" + e1.getMessage() + ")");
            }
        }
        writer.optimize();
        writer.close();
        if (statusBar != null) {
            statusBar.setStatus("Indexing finished");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:at.lux.fotoretrieval.retrievalengines.LuceneRetrievalEngine.java

License:Open Source License

public void indexFilesSemantically(String pathToIndex, StatusBar statusBar) {
    if (statusBar != null)
        statusBar.setStatus("Creating index from semantic annotations");

    SAXBuilder builder = new SAXBuilder();
    XMLOutputter outputter = new XMLOutputter(
            Format.getRawFormat().setIndent("").setLineSeparator("").setExpandEmptyElements(false));

    try {
        String[] descriptions = FileOperations.getAllDescriptions(new File(pathToIndex), true);
        if (descriptions == null)
            return;
        float numAllDocsPercent = (float) descriptions.length / 100f;
        DecimalFormat df = (DecimalFormat) NumberFormat.getInstance();
        df.setMaximumFractionDigits(1);

        // Preparing objects for the index:
        HashMap<String, ElementEntry> elementMap = new HashMap<String, ElementEntry>(descriptions.length);
        HashMap<Element, LinkedList<String>> element2document = new HashMap<Element, LinkedList<String>>(
                descriptions.length);

        // in the first run we identify the semantic objects that we want to index and build
        // a table where we can relate them to the documents (identified by their path)
        for (int i = 0; i < descriptions.length; i++) {
            try {
                Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                    Element semanticElement = (Element) iterator.next();
                    String xmlString = outputter.outputString(semanticElement).trim()
                            .replaceAll("id=\"id_[0-9]*\"", "");
                    // check if element is already there, indicator is its string representation.
                    if (!elementMap.keySet().contains(xmlString)) {
                        // it's not here yet, so put it in.
                        elementMap.put(xmlString, new ElementEntry(semanticElement, elementMap.size()));
                        //                            System.out.println(xmlString);
                    }
                    // now get the unified element
                    semanticElement = elementMap.get(xmlString).semanticElement;
                    // and check if there is an entry in the table for where to find the element
                    if (!element2document.keySet().contains(semanticElement)) {
                        element2document.put(semanticElement, new LinkedList<String>());
                    }
                    // and add found document if not already there:
                    List documentList = element2document.get(semanticElement);
                    if (!documentList.contains(descriptions[i]))
                        documentList.add(descriptions[i]);
                }
                if (statusBar != null)
                    statusBar.setStatus(
                            "Parsing documents for nodes: " + df.format((float) i / numAllDocsPercent));
            } catch (JDOMException e1) {
                System.err.println("Exception in document #" + i + ": " + e1.getMessage());
            } catch (IOException e1) {
                e1.printStackTrace();
            }
        }
        // read stats:
        // System.out.println("Got " + countOverallElements + " Elements in " + descriptions.length + " descriptions, " + elementMap.size() + " elements are pairwise different.");

        // Now we can add the nodes to a lucene index:
        // fields: label, id, type, files (separated by '|'), xml, all
        // -------------------------------------------

        // opening the index for writing:
        boolean createFlag = true;
        String indexDir = parseSemanticIndexDirectory(pathToIndex);
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriter writer = new IndexWriter(indexDir, analyzer, createFlag);

        if (statusBar != null)
            statusBar.setStatus("Creating index for " + element2document.size() + " different available nodes");

        // iterating through nodes and storing them:
        for (Iterator<Element> iterator = element2document.keySet().iterator(); iterator.hasNext();) {
            Element semElement = iterator.next();
            // needed for later XPath :( otherwise everything in the whole document is retrieved.

            String fileList = getFileListFromNode(element2document.get(semElement));
            Document idxDocument = new Document();
            // adding the file itself ...
            idxDocument.add(new Field("files", fileList, Field.Store.YES, Field.Index.NO));

            //                System.out.println(((Element) o).getTextTrim());

            StringBuilder all = new StringBuilder(255);
            // adding the label
            //                addToDocument(idxDocument, semElement, "//Label/Name", "label", all);
            String elementLabel = semElement.getChild("Label", semElement.getNamespace())
                    .getChildTextTrim("Name", semElement.getNamespace());
            idxDocument.add(new Field("label", elementLabel, Field.Store.YES, Field.Index.TOKENIZED));

            // adding the type:
            String elementType = semElement.getAttribute("type", xsi).getValue().trim();
            idxDocument.add(new Field("type", elementType, Field.Store.YES, Field.Index.NO));
            // adding the XML contents:
            String xmlString = outputter.outputString(semElement);
            idxDocument.add(new Field("xml", xmlString, Field.Store.YES, Field.Index.NO));
            // adding the id:
            idxDocument.add(
                    new Field("id", elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id + "",
                            Field.Store.YES, Field.Index.NO));
            // adding all, unstored for retrieval only
            List l = RetrievalToolkit.xpathQuery(semElement, "*//*", null);
            for (Iterator it3 = l.iterator(); it3.hasNext();) {
                Element e = (Element) it3.next();
                all.append(e.getTextTrim());
                all.append(" ");
            }
            idxDocument.add(new Field("all", all.toString(), Field.Store.NO, Field.Index.TOKENIZED));

            writer.addDocument(idxDocument);

        }
        // now optimize and close the index:
        // todo: open index for appending and/or updating
        writer.optimize();
        writer.close();

        // Now we can create the powerset for each existing graph
        // (based on sorted node ids) and store
        // all resulting graphs within an index.
        // ----------------------------------------------------------
        if (statusBar != null)
            statusBar.setStatus("Creating and merging powersets of available graphs");
        HashMap<Graph, HashSet<String>> graph2document = new HashMap<Graph, HashSet<String>>(
                descriptions.length);
        for (int i = 0; i < descriptions.length; i++) {
            try {
                Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                HashMap<String, Integer> docID2overallID = new HashMap<String, Integer>(l.size());
                LinkedList<Relation> relations = new LinkedList<Relation>();
                LinkedList<Integer> nodes = new LinkedList<Integer>();
                for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                    Element semanticElement = (Element) iterator.next();
                    String xmlString = outputter.outputString(semanticElement);
                    int id = elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id;
                    String docID = semanticElement.getAttribute("id").getValue();
                    docID2overallID.put(docID, id);
                    nodes.add(id);
                }
                // get all relations with global ids and eliminate inverse relations
                l = RetrievalToolkit.xpathQuery(e, "//Graph/Relation", null);
                for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                    Element relation = (Element) iterator.next();
                    int source = docID2overallID.get(relation.getAttribute("source").getValue().substring(1));
                    int target = docID2overallID.get(relation.getAttribute("target").getValue().substring(1));
                    String type = relation.getAttribute("type").getValue();
                    type = type.substring(type.lastIndexOf(':') + 1);
                    Relation r = eliminateInverse(new Relation(source, target, type));
                    relations.add(r);
                }

                // now create a graph object
                Collections.sort(nodes);
                Collections.sort(relations);
                LinkedList<Node> nodeList = new LinkedList<Node>();
                for (Iterator<Integer> iterator = nodes.iterator(); iterator.hasNext();) {
                    nodeList.add(new Node(iterator.next()));
                }
                Graph g = new Graph(nodeList, relations);
                //                    List<Graph> powerSet = new LinkedList<Graph>();
                //                    powerSet.add(g);
                HashSet<String> docs = new HashSet<String>(1);
                docs.add(descriptions[i]);
                graph2document.put(g, docs);
                /*
                // add all these subgraphs and the reference to the document to
                // a data structure:
                for (Iterator<Graph> iterator = powerSet.iterator(); iterator.hasNext();) {
                    Graph graph = iterator.next();
                    // List<Graph> relationsPowerSet = graph.getPowerSetOfRelations();
                    // for (Iterator<Graph> iterator1 = relationsPowerSet.iterator(); iterator1.hasNext();) {
                    //     Graph graph1 = iterator1.next();
                    // }
                    // add graph if not trivial:
                    if (graph.getNodes().size() > 1) {
                        // containsKey for Graph does not match my needs -
                        // different graph objects reference the same graph!
                        if (string2graph.containsKey(graph.toString())) {
                            graph = string2graph.get(graph.toString());
                            graph2document.get(graph).add(descriptions[i]);
                        } else {
                            HashSet<String> docs = new HashSet<String>(1);
                            docs.add(descriptions[i]);
                            graph2document.put(graph, docs);
                        }
                    }
                }
                */
            } catch (JDOMException e1) {
                System.err.println("Exception in document #" + i + ": " + e1.getMessage());
            }
        }

        HashMap<String, Graph> str2graph = new HashMap<String, Graph>(graph2document.size() / 2);
        HashMap<Graph, HashSet<String>> g2d = new HashMap<Graph, HashSet<String>>(descriptions.length);

        /*
        For now we reduce the number of graphs by identifying and merging duplicates and
        removing redundant entries:
        */
        for (Iterator<Graph> iterator = graph2document.keySet().iterator(); iterator.hasNext();) {
            Graph g = iterator.next();
            if (str2graph.containsKey(g.toString())) {
                g2d.get(str2graph.get(g.toString())).addAll(graph2document.get(g));
            } else {
                str2graph.put(g.toString(), g);
                g2d.put(g, graph2document.get(g));
            }
        }
        graph2document = g2d;
        System.out.println(graph2document.size() + " non trivial different graphs were found");
        // now put all the available graphs into an index:
        // -----------------------------------------------
        // todo: create real fast storable index of subgraphs instead of file :-) possible candidate a trie

        // for now we will store a simple text file:
        if (statusBar != null)
            statusBar.setStatus("Storing powersets of available graphs as file");
        String indexFile;
        if (!pathToIndex.endsWith(File.separator)) {
            indexFile = pathToIndex + File.separator + "idx_graphs.list";
        } else {
            indexFile = pathToIndex + "idx_graphs.list";
        }
        File f = new File(indexFile);
        BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(f, false))));
        for (Iterator<Graph> iterator = graph2document.keySet().iterator(); iterator.hasNext();) {
            Graph g = iterator.next();
            bw.write(g.toString());
            for (Iterator<String> iterator1 = graph2document.get(g).iterator(); iterator1.hasNext();) {
                String s = iterator1.next();
                bw.write("|" + s);
            }
            bw.write("\n");
        }
        bw.close();
    } catch (IOException e) {
        e.printStackTrace();
    }

}

From source file:au.edu.unimelb.csse.analyser.NodeCacheTest.java

License:Apache License

public void testReusesNodesWhileIndexing() throws Exception {
    String[] sents = new String[] { "(A(B C)(D(E F)))", "(A(B(C D)))", "(A(B C)(D(E(F(G H)))))", "(A(B C))" };
    String[] jsonSents = new String[sents.length];
    String2NodesParser parser = new String2NodesParser();
    assertEquals(0, NodeCache.cacheSize());
    int[] expectedCounts = new int[] { 0, 2, 0, 5 };
    //First sent: 6 nodes are used but they are not returned until the next sentence is read. 
    //Hence the cache still returns a size of 0
    //Second sent: 6 nodes are returned back but the new sentence contains 4 nodes
    //6 - 4 = 2
    //Third sent: 4 nodes are returned back but the new sentence contains 8 nodes
    //size shows 0 again
    //Fourth sent: 8 nodes are returned back but the new sentence contains 3 nodes
    //8 - 3 = 5

    for (int i = 0; i < sents.length; i++) {
        jsonSents[i] = parser.parse(sents[i]).asJSONString();
        assertEquals(expectedCounts[i], NodeCache.cacheSize());
    }
    Analyzer analyser = new NodeTreebankAnalyser(false);
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, analyser, true, IndexWriter.MaxFieldLength.UNLIMITED);

    Document d = new Document();
    d.add(new Field("sent", jsonSents[0], Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(d);
    //No change to Node cache
    assertEquals(5, NodeCache.cacheSize());

    d = new Document();
    d.add(new Field("sent", jsonSents[1], Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(d);
    //No change to Node cache
    assertEquals(5, NodeCache.cacheSize());

    d = new Document();
    d.add(new Field("sent", jsonSents[2], Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(d);
    //No change to Node cache
    assertEquals(5, NodeCache.cacheSize());

    d = new Document();
    d.add(new Field("sent", jsonSents[3], Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(d);
    //No change to Node cache
    assertEquals(5, NodeCache.cacheSize());

}

From source file:au.edu.unimelb.csse.join.JoinFunctionalTest.java

License:Apache License

/**
 * This test is effectively disabled: to run it, match counting has to be enabled in JoinLogic.
 * @throws Exception
 */
public void testNumberOfCallsToMatch() throws Exception {
    String sent = "(NP" + "(NP" + "(DT The)" + "(NN year))" + "(NP" + "(NP(CD 1956))" + "(PP" + "(IN in)"
            + "(NP(JJ rugby)(NN union))" + ")" + ")" + "(. .)" + ")";
    Analyzer analyser = new FastStringAnalyser();
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, analyser, true, IndexWriter.MaxFieldLength.UNLIMITED);

    Document d = new Document();
    d.add(new Field("sent", sent, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(d);

    writer.close();

    IndexSearcher searcher = new IndexSearcher(dir);
    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.SIMPLE, false, 6);

    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.SIMPLE_WITH_FC, false, 1);

    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.EARLY_STOP, false, 2);

    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.EARLY_STOP_WITH_FC, false, 1);

    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.SIMPLE, true, 6);

    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.SIMPLE_WITH_FC, true, 5);

    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.EARLY_STOP, true, 6);

    assertNumberOfComparisons(searcher, "//NP//NP", TermJoinType.EARLY_STOP_WITH_FC, true, 5);

    assertNumberOfComparisons(searcher, "//NP//NP//NP", TermJoinType.SIMPLE, false, 23);

    assertNumberOfComparisons(searcher, "//NP//NP//NP", TermJoinType.SIMPLE_WITH_FC, false, 10);

    assertNumberOfComparisons(searcher, "//NP//NP//NP", TermJoinType.EARLY_STOP, false, 10);

    assertNumberOfComparisons(searcher, "//NP//NP//NP", TermJoinType.EARLY_STOP_WITH_FC, false, 8);

}

From source file:au.edu.unimelb.csse.join.JoinFunctionalTest.java

License:Apache License

public void testFilterjoin() throws Exception {
    String sent = "(NP" + "(NP" + "(DT The)" + "(NN year))" + "(NP" + "(NP(CD 1956))" + "(PP" + "(IN in)"
            + "(NP(JJ rugby)(NN union))" + ")" + ")" + "(. .)" + ")";
    Analyzer analyser = new FastStringAnalyser();
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, analyser, true, IndexWriter.MaxFieldLength.UNLIMITED);

    Document d = new Document();
    d.add(new Field("sent", sent, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS));
    writer.addDocument(d);

    writer.close();

    IndexSearcher searcher = new IndexSearcher(dir);

    boolean[] lookaheadOptions = new boolean[] { false, true };
    for (TermJoinType type : TermJoinType.values()) {
        for (boolean lookahead : lookaheadOptions) {
            QueryBuilder builder = new QueryBuilder("//PP[/IN AND /NP]");
            TreebankQuery query = builder.parse(type, lookahead);
            SimpleHitCollector hitCollector = new SimpleHitCollector(10);
            searcher.search(query, hitCollector);
            assertEquals(1, hitCollector.totalHits);
        }
    }

    QueryBuilder builder = new QueryBuilder("//PP[/IN AND /NP/JJ/rugby]");
    TreebankQuery query = builder.parse(TermJoinType.SIMPLE, true);
    SimpleHitCollector hitCollector = new SimpleHitCollector(10);
    searcher.search(query, hitCollector);
    assertEquals(1, hitCollector.totalHits);

}

From source file:au.org.ala.names.search.ALANameIndexer.java

License:Open Source License

/**
 * Creates the temporary index that provides a lookup of checklist bank id to
 * GUID.
 */
private IndexSearcher createTmpGuidIndex(String cbExportFile) throws Exception {
    System.out.println("Starting to create the tmp guid index...");
    IndexWriter iw = createIndexWriter(new File("/data/tmp/guid"), new KeywordAnalyzer(), true);
    au.com.bytecode.opencsv.CSVReader cbreader = new au.com.bytecode.opencsv.CSVReader(
            new FileReader(cbExportFile), '\t', '"', '/', 1);
    for (String[] values = cbreader.readNext(); values != null; values = cbreader.readNext()) {
        Document doc = new Document();
        String id = values[POS_ID];
        String guid = values[POS_LSID];
        doc.add(new StringField("id", id, Store.YES));
        // fall back to the checklist bank id when no GUID was exported
        if (StringUtils.isEmpty(guid))
            guid = id;

        doc.add(new StoredField("guid", guid));
        iw.addDocument(doc);
    }
    System.out.println("Finished writing the tmp guid index...");
    iw.commit();
    iw.forceMerge(1);
    iw.close();
    //As of lucene 4.0 all IndexReaders are read only
    return new IndexSearcher(DirectoryReader.open(FSDirectory.open(new File("/data/tmp/guid"))));
}

From source file:au.org.ala.names.search.ALANameIndexer.java

License:Open Source License

/**
 * Adds the extra ALA concepts from the legislated lists that are missing from the NSL.
 *
 * @param iw
 * @param file
 * @throws Exception
 */
private void addExtraALAConcept(IndexWriter iw, String file) throws Exception {
    if (new File(file).exists()) {
        au.com.bytecode.opencsv.CSVReader reader = new au.com.bytecode.opencsv.CSVReader(new FileReader(file),
                ',', '"', '\\', 1);
        for (String[] values = reader.readNext(); values != null; values = reader.readNext()) {
            String lsid = values[0];
            String scientificName = values[1];
            String authority = values[2];
            Document doc = createALAIndexDocument(scientificName, "-1", lsid, authority, null);
            iw.addDocument(doc);
        }
    }
}