List of usage examples for org.apache.lucene.index IndexReader document
public final Document document(int docID) throws IOException
Returns the stored fields of the nth Document in this index.
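For orientation before the project examples below, here is a minimal sketch of the call. It is not taken from any of the listed projects; it assumes the Lucene 3.x API that most of these examples use, and the index path and the "title" field name are placeholders.

    // Minimal usage sketch (Lucene 3.x style); path and field name are placeholders.
    // Requires org.apache.lucene.document.Document, org.apache.lucene.index.IndexReader,
    // org.apache.lucene.store.FSDirectory and java.io.File.
    IndexReader reader = IndexReader.open(FSDirectory.open(new File("/path/to/index")));
    try {
        for (int i = 0; i < reader.maxDoc(); i++) {
            if (reader.isDeleted(i)) {
                continue; // skip slots freed by deletions
            }
            Document doc = reader.document(i); // load the stored fields of document i
            System.out.println(doc.get("title"));
        }
    } finally {
        reader.close();
    }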
From source file:net.sf.jclal.util.dataset.LuceneIndexToWekaDataSet.java
License:Open Source License
/**
 * Converts a Lucene index file to a Weka file for regression. The Weka class
 * attribute is real-valued, so the classifiers used must handle numeric (real) classes.
 *
 * @param wekaFileName Path of the Weka file.
 * @param indexFile Path of the Lucene-based index file. The indexed documents
 * must have fields called "class" and "content". WARNING: The fields must
 * not contain any punctuation signs.
 *
 * @return Weka instances. The instances are sparse, since they represent text data.
 *
 * @throws FileNotFoundException If the file does not exist.
 * @throws IOException If an error occurs while writing the file.
 */
public Instances convertLuceneToWekaRegression(String wekaFileName, String indexFile)
        throws FileNotFoundException, IOException {
    File nuevo = new File(wekaFileName);
    if (!verify(nuevo)) {
        return null;
    }
    FileUtil.writeFile(nuevo, "@RELATION " + nuevo.getName() + doubleLine);

    IndexSearcher searcher = new IndexSearcher(indexFile);
    IndexReader reader = searcher.getIndexReader();

    int total = reader.maxDoc();
    HashMap<String, Integer> terms = new HashMap<String, Integer>(total * 2);
    HashMap<String, Integer> labels = new HashMap<String, Integer>(total * 2);
    int i;
    for (int l = 0; l < total; l++) {
        if (!reader.isDeleted(l)) {
            TermFreqVector vector = reader.getTermFreqVector(l, content);
            Document doc = reader.document(l);
            String current = doc.getField(classF).stringValue();
            if (!labels.containsKey(current)) {
                labels.put(current, labels.size());
            }
            if (vector != null) {
                String listosI[] = vector.getTerms();
                for (i = 0; i < listosI.length; i++) {
                    if (!terms.containsKey(listosI[i])) {
                        terms.put(listosI[i], terms.size());
                    }
                }
            }
        }
    }
    Container[] terminos = convertir(terms);
    Arrays.sort(terminos);
    for (int j = 0; j < terminos.length; j++) {
        FileUtil.writeFile(nuevo, "@ATTRIBUTE " + (int) terminos[j].getKey() + " NUMERIC" + "\n");
    }
    FileUtil.writeFile(nuevo, "@ATTRIBUTE class REAL [0.0,");
    FileUtil.writeFile(nuevo, (labels.size() - 1) + ".0]" + doubleLine);
    FileUtil.writeFile(nuevo, "@DATA\n");
    for (int pos = 0; pos < searcher.maxDoc(); pos++) {
        if (!reader.isDeleted(pos)) {
            TermFreqVector vector = reader.getTermFreqVector(pos, content);
            if (vector != null) {
                int[] origen = vector.getTermFrequencies();
                String[] termsI = vector.getTerms();
                int[] positions = new int[origen.length];
                for (int k = 0; k < origen.length; k++) {
                    positions[k] = terms.get(termsI[k]);
                }
                Container[] escribir = convertir(positions, origen);
                Arrays.sort(escribir);
                FileUtil.writeFile(nuevo, "{");
                for (int j = 0; j < escribir.length; j++) {
                    FileUtil.writeFile(nuevo, (int) escribir[j].getKey() + " " + escribir[j].getValue() + ",");
                }
                FileUtil.writeFile(nuevo, terms.size() + " "
                        + labels.get(searcher.doc(pos).getField(classF).stringValue()) + ".0}\n");
            }
        }
    }
    // close files
    closeReaders(searcher, reader);
    // Test if the weka file works
    Instances test = testWekaFile(wekaFileName);
    return test;
}
From source file:net.sf.logsaw.index.internal.LuceneIndexServiceImpl.java
License:Open Source License
private Date getLatestEntryDate(ILogResource log) throws CoreException {
    if (!hasDateComponent(log)) {
        return null;
    }
    ARunWithIndexReader<Date> runnable = new ARunWithIndexReader<Date>() {

        /* (non-Javadoc)
         * @see net.sf.logsaw.index.impl.ARunWithIndexReader#doRunWithIndexReader(org.apache.lucene.index.IndexReader, net.sf.logsaw.core.framework.ILogResource)
         */
        @Override
        protected Date doRunWithIndexReader(IndexReader reader, ILogResource log) throws CoreException {
            if (reader == null) {
                // Index does not exist yet
                return null;
            }
            int i = reader.maxDoc();
            if (i > 0) {
                try {
                    Document doc = reader.document(i - 1);
                    String val = doc.get(log.getDialect().getFieldProvider().getTimestampField().getKey());
                    return log.getDialect().getFieldProvider().getTimestampField().fromIndexedValue(val);
                } catch (IOException e) {
                    // Unexpected exception; wrap with CoreException
                    throw new CoreException(new Status(IStatus.ERROR, IndexPlugin.PLUGIN_ID,
                            NLS.bind(Messages.LuceneIndexService_error_failedToReadIndex,
                                    new Object[] { log.getName(), e.getLocalizedMessage() }), e));
                }
            }
            return null;
        }
    };
    return runnable.runWithIndexReader(log);
}
From source file:net.sf.zekr.engine.search.lucene.QuranTextSearcher.java
@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    BitSet bits = new BitSet(reader.maxDoc());
    for (int i = 0; i < reader.maxDoc(); i++) {
        Document doc = reader.document(i);
        IQuranLocation loc = new QuranLocation(doc.getField("location").stringValue());
        if (searchScope.includes(loc)) {
            bits.set(i);
        }
    }
    return new DocIdBitSet(bits);
}
From source file:nl.elucidator.maven.analyzer.indexer.IndexSearcher.java
License:Apache License
public Set<ArtifactInfo> getUniqueGAV() throws IOException, ComponentLookupException {
    IndexingContext centralContext = indexUpdater.getIndexContext();
    centralContext.lock();
    Set<ArtifactInfo> artifactInfoSet = new HashSet<ArtifactInfo>();
    try {
        final IndexReader ir = centralContext.getIndexReader();
        for (int i = 0; i < ir.maxDoc(); i++) {
            if (!ir.isDeleted(i)) {
                final Document doc = ir.document(i);
                final ArtifactInfo ai = IndexUtils.constructArtifactInfo(doc, centralContext);
                if (ai != null) {
                    artifactInfoSet.add(ai);
                }
            }
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        centralContext.unlock();
    }
    return artifactInfoSet;
}
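A side note on API versions: this example, like several others on this page, guards reader.document(i) with IndexReader.isDeleted(i), which exists only up to Lucene 3.x. As a hedged sketch, assuming Lucene 4.x to 7.x, the same iterate-all-documents pattern would rely on live-docs bits instead (the "id" field name is a placeholder):

    // Sketch assuming Lucene 4.x-7.x; uses org.apache.lucene.index.MultiFields and org.apache.lucene.util.Bits.
    Bits liveDocs = MultiFields.getLiveDocs(reader); // null when the index contains no deletions
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (liveDocs != null && !liveDocs.get(i)) {
            continue; // skip deleted documents
        }
        Document doc = reader.document(i); // stored fields of the i-th document
        System.out.println(doc.get("id"));
    }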
From source file:nl.inl.blacklab.perdocument.DocResults.java
License:Apache License
/**
 * Construct DocResults from a Scorer (Lucene document results).
 *
 * @param searcher the searcher that generated the results
 * @param scorer the scorer to read document results from
 */
DocResults(Searcher searcher, Scorer scorer) {
    this.searcher = searcher;
    if (scorer == null)
        return; // no matches, empty result set
    try {
        IndexReader indexReader = searcher.getIndexReader();
        while (true) {
            int docId;
            try {
                docId = scorer.nextDoc();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            if (docId == DocIdSetIterator.NO_MORE_DOCS)
                break;
            Document d = indexReader.document(docId);
            DocResult dr = new DocResult(searcher, null, docId, d, scorer.score());
            results.add(dr);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:nl.inl.blacklab.perdocument.DocResults.java
License:Apache License
private void addDocResultToList(int doc, Hits docHits, IndexReader indexReader) throws IOException {
    DocResult docResult = new DocResult(searcher, sourceHits.getConcordanceFieldName(), doc,
            indexReader == null ? null : indexReader.document(doc), docHits);
    // Make sure we remember what kind of context we have, if any
    docResult.setContextField(sourceHits.getContextFieldPropName());
    results.add(docResult);
}
From source file:nl.uva.mlc.eurovoc.analyzer.PropagationAnalyzer.java
private void testIndexDocReader() {
    try {
        IndexReader testIreader = IndexReader
                .open(new SimpleFSDirectory(new File(configFile.getProperty("TEST_INDEX_PATH"))));
        for (int i = 0; i < testIreader.numDocs(); i++) {
            String id = testIreader.document(i).get("ID");
            String title = testIreader.document(i).get("TITLE");
            String text = testIreader.document(i).get("TEXT");
            String namedEntities = testIreader.document(i).get("NAMEDENTITIES");
            String[] classes = testIreader.document(i).get("CLASSES").split("\\s+");
            EuroVocDoc doc = new EuroVocDoc(id, title, text, namedEntities,
                    new ArrayList<String>(Arrays.asList(classes)));
            Quering(doc);
            log.info(i + " from " + testIreader.numDocs());
        }
    } catch (IOException ex) {
        log.error(ex);
    }
}
From source file:nmsu.cs.TFIDFVector.java
License:Open Source License
/**
 * calculate likelihood from the index
 * @param indexDir
 * @param lambda
 */
public void calLikelihoodFromIndex(String indexDir, double lambda) {
    try {
        IndexReader ir = IndexReader.open(FSDirectory.open(new File(indexDir)));
        IndexSearcher is = new IndexSearcher(ir);
        int numDocs = ir.maxDoc();
        double LLH = 0;

        // vocabulary list
        List<String> vocab = new ArrayList<String>();
        TermEnum te = ir.terms();
        // create vocabulary
        while (te.next()) {
            String term = te.term().text();
            // System.out.println(term);
            vocab.add(term);
        }
        TFIDFVector.vocabulary = vocab;

        // dataset id to index id
        Map<Integer, Integer> idMap = new HashMap<Integer, Integer>();
        for (int i = 0; i < numDocs; i++) {
            Document doc = ir.document(i);
            idMap.put(Integer.parseInt(doc.get("docid")), i);
        }

        // o -> a -> o'
        Map<Integer, Map<Integer, Map<Integer, Double>>> cosineSimMap = new HashMap<Integer, Map<Integer, Map<Integer, Double>>>();
        // (o | o') dataset id -> tfidf vector
        Map<Integer, TFIDFVector> docVectorMap = new HashMap<Integer, TFIDFVector>();
        // o -> a -> vector
        Map<Integer, Map<Integer, TFIDFVector>> docAspectVectorMap = new HashMap<Integer, Map<Integer, TFIDFVector>>();

        Set<Integer> citedSet = new HashSet<Integer>();

        // for all citing documents
        for (Map.Entry<Integer, List<Integer>> entry : rawdata.pubId2CiteIds.entrySet()) { // llh for citing documents
            int citingDatasetID = entry.getKey();
            int citingIndexID = idMap.get(citingDatasetID);

            // set up citing document vector
            TFIDFVector citingVector = BaseLineMethod.getFullTextTFIDFVector(docVectorMap, ir, citingDatasetID,
                    citingIndexID, numDocs);
            float sum = citingVector.sum();
            // System.out.println(Debugger.getCallerPosition()+" "+citingDatasetID);
            List<Integer> refList = entry.getValue();

            // for all aspects
            for (Integer aspectID : rawdata.id2Aspect.keySet()) {
                String aspect = rawdata.id2Aspect.get(aspectID);

                // set up citing document aspect vector
                double aspectSim = 0;
                if (rawdata.id2Docs.get(citingDatasetID).getText().get(aspectID).length() != 0) {
                    TFIDFVector citingAspectVector = BaseLineMethod.getAspectTFIDFVector(docAspectVectorMap, ir,
                            citingDatasetID, citingIndexID, aspectID, numDocs);
                    citingAspectVector.normalizedBy(sum);

                    int refSize = refList.size();
                    TFIDFVector[] citedVectors = new TFIDFVector[refSize];
                    double[] cosineSims = new double[refSize];
                    int count = 0;

                    // for all cited documents of this citing document
                    for (Integer citedDatasetID : refList) {
                        citedSet.add(citedDatasetID);

                        // set up cited document vector
                        int citedIndexID = idMap.get(citedDatasetID);
                        TFIDFVector citedVector = BaseLineMethod.getFullTextTFIDFVector(docVectorMap, ir,
                                citedDatasetID, citedIndexID, numDocs);
                        citedVector.normalize();

                        aspectSim = TFIDFVector.computeCosineSim(citedVector, citingAspectVector);
                        // System.out.println(Debugger.getCallerPosition()+"\t\t"+aspectSim);
                        System.out.println(
                                citingDatasetID + "\t" + aspectID + "\t" + citedDatasetID + "\t" + aspectSim);

                        citedVectors[count] = citedVector;
                        cosineSims[count] = aspectSim;
                        count++;
                    }
                    double aspectLLH = citingAspectVector.posteriorLLH(citedVectors, cosineSims, lambda);
                    LLH += aspectLLH;
                }
                // Util.update3Map(cosineSimMap, citingDatasetID, aspectID, citedDatasetID, aspectSim);
            }
        }

        for (Integer citedDatasetID : citedSet) {
            int citedIndexID = idMap.get(citedDatasetID);
            TFIDFVector citedVector = BaseLineMethod.getFullTextTFIDFVector(docVectorMap, ir, citedDatasetID,
                    citedIndexID, numDocs);
            citedVector.normalize();
            LLH += citedVector.priorLLH();
        }

        System.out.println(LLH);
        is.close();
        ir.close();
    } catch (CorruptIndexException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
From source file:opennlp.tools.similarity.apps.solr.IterativeSearchRequestHandler.java
License:Apache License
public DocList filterResultsBySyntMatchReduceDocSet(DocList docList, SolrQueryRequest req, SolrParams params) {
    //if (!docList.hasScores())
    //    return docList;

    int len = docList.size();
    if (len < 1) // do nothing
        return docList;
    ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor.getInstance();
    DocIterator iter = docList.iterator();
    float[] syntMatchScoreArr = new float[len];
    String requestExpression = req.getParamString();
    String[] exprParts = requestExpression.split("&");
    for (String part : exprParts) {
        if (part.startsWith("q="))
            requestExpression = part;
    }
    String fieldNameQuery = StringUtils.substringBetween(requestExpression, "=", ":");
    // extract phrase query (in double-quotes)
    String[] queryParts = requestExpression.split("\"");
    if (queryParts.length >= 2 && queryParts[1].length() > 5)
        requestExpression = queryParts[1].replace('+', ' ');
    else if (requestExpression.indexOf(":") > -1) { // still field-based expression
        requestExpression = requestExpression.replaceAll(fieldNameQuery + ":", "").replace('+', ' ')
                .replaceAll(" ", " ").replace("q=", "");
    }

    if (fieldNameQuery == null)
        return docList;
    if (requestExpression == null || requestExpression.length() < 5 || requestExpression.split(" ").length < 3)
        return docList;

    int[] docIDsHits = new int[len];

    IndexReader indexReader = req.getSearcher().getIndexReader();
    List<Integer> bestMatchesDocIds = new ArrayList<Integer>();
    List<Float> bestMatchesScore = new ArrayList<Float>();
    List<Pair<Integer, Float>> docIdsScores = new ArrayList<Pair<Integer, Float>>();
    try {
        for (int i = 0; i < docList.size(); ++i) {
            int docId = iter.nextDoc();
            docIDsHits[i] = docId;
            Document doc = indexReader.document(docId);

            // get text for event
            String answerText = doc.get(fieldNameQuery);
            if (answerText == null)
                continue;
            SentencePairMatchResult matchResult = pos.assessRelevance(requestExpression, answerText);
            float syntMatchScore = new Double(
                    parseTreeChunkListScorer.getParseTreeChunkListScore(matchResult.getMatchResult()))
                            .floatValue();
            bestMatchesDocIds.add(docId);
            bestMatchesScore.add(syntMatchScore);
            syntMatchScoreArr[i] = (float) syntMatchScore; //*iter.score();
            System.out.println(" Matched query = '" + requestExpression + "' with answer = '" + answerText
                    + "' | doc_id = '" + docId);
            System.out.println(" Match result = '" + matchResult.getMatchResult() + "' with score = '"
                    + syntMatchScore + "';");
            docIdsScores.add(new Pair(docId, syntMatchScore));
        }
    } catch (CorruptIndexException e1) {
        // TODO Auto-generated catch block
        e1.printStackTrace();
        //log.severe("Corrupt index"+e1);
    } catch (IOException e1) {
        // TODO Auto-generated catch block
        e1.printStackTrace();
        //log.severe("File read IO / index"+e1);
    }

    Collections.sort(docIdsScores, new PairComparable());
    for (int i = 0; i < docIdsScores.size(); i++) {
        bestMatchesDocIds.set(i, docIdsScores.get(i).getFirst());
        bestMatchesScore.set(i, docIdsScores.get(i).getSecond());
    }
    System.out.println(bestMatchesScore);
    float maxScore = docList.maxScore(); // do not change
    int limit = docIdsScores.size();
    int start = 0;
    DocSlice ds = null;

    ds = new DocSlice(start, limit, ArrayUtils.toPrimitive(bestMatchesDocIds.toArray(new Integer[0])),
            ArrayUtils.toPrimitive(bestMatchesScore.toArray(new Float[0])), bestMatchesDocIds.size(), maxScore);

    return ds;
}
From source file:opennlp.tools.similarity.apps.solr.IterativeSearchRequestHandler.java
License:Apache License
private void append(SolrDocumentList results, ScoreDoc[] more, Set<Integer> alreadyFound,
        Map<String, SchemaField> fields, Map<String, Object> extraFields, float scoreCutoff,
        IndexReader reader, boolean includeScore) throws IOException {
    for (ScoreDoc hit : more) {
        if (alreadyFound.contains(hit.doc)) {
            continue;
        }
        Document doc = reader.document(hit.doc);
        SolrDocument sdoc = new SolrDocument();
        for (String fieldname : fields.keySet()) {
            SchemaField sf = fields.get(fieldname);
            if (sf.stored()) {
                sdoc.addField(fieldname, doc.get(fieldname));
            }
        }
        for (String extraField : extraFields.keySet()) {
            sdoc.addField(extraField, extraFields.get(extraField));
        }
        if (includeScore) {
            sdoc.addField("score", hit.score);
        }
        results.add(sdoc);
        alreadyFound.add(hit.doc);
    }
}