Example usage for org.apache.lucene.index IndexReader document

List of usage examples for org.apache.lucene.index IndexReader document

Introduction

On this page you can find example usage for org.apache.lucene.index IndexReader document.

Prototype

public final Document document(int docID) throws IOException

Source Link

Document

Returns the stored fields of the nth Document in this index, addressed by its internal document ID.
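For orientation, here is a minimal, self-contained sketch of calling document(int). It assumes the Lucene 3.x API used by the examples below; the index path and the stored "DOCID" field are hypothetical.

import java.io.File;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class DocumentExample {
    public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(new File("/path/to/index")); // hypothetical path
        IndexReader reader = IndexReader.open(dir);
        try {
            for (int docId = 0; docId < reader.maxDoc(); docId++) {
                if (reader.isDeleted(docId))
                    continue; // skip deleted slots
                Document doc = reader.document(docId); // load the stored fields
                System.out.println(doc.get("DOCID")); // "DOCID" is an assumed stored field
            }
        } finally {
            reader.close();
        }
    }
}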

Usage

From source file:engine.easy.search.EasySearchEngine.java

License:Apache License

/**
 * Computes the results with the BM25 ranking function and other scoring factors.
 *
 * @param query the parsed query
 * @param ixReader the index reader
 * @param esiReader the custom easy index reader
 * @param relevanceDocMap per-document relevance boosts from feedback (may be null)
 * @return the Results.
 */
public Result[] getResults(Query query, IndexReader ixReader, EasySearchIndexReader esiReader,
        Map<Integer, Float> relevanceDocMap) {

    Map<Integer, Result> results = null;

    try {
        Set<Term> terms = new HashSet<Term>();
        query.extractTerms(terms);

        results = new HashMap<Integer, Result>();
        Iterator<Term> itr = terms.iterator();

        while (itr.hasNext()) {
            Term term = itr.next();

            TermDocs docs = ixReader.termDocs(term);
            int docFreq = ixReader.docFreq(term); // document frequency of the term from Lucene's index reader
            // Total record count for the field from the extra index. ixReader.maxDoc() would only count
            // documents, and some documents may lack the search field (though here every document has it).
            int docNum = esiReader.recordCount(AppConstants.CONTENT_FIELD);

            while (docs.next()) {
                Integer id = docs.doc(); // get the internal lucene's id of the document
                int termFreq = docs.freq(); // get the frequency of the term in this document
                int docLen = esiReader.docLength(id, AppConstants.CONTENT_FIELD); // get the length of the document from lucene extra index.
                double avgDocLen = esiReader.avgFieldLength(AppConstants.CONTENT_FIELD); // get the average length of the search field from lucene extra index.
                Document document = ixReader.document(id); //get the particular document.
                String storedField = extractData(document.get(AppConstants.CONTENT_FIELD));

                // Compute the score with BM25 ranking, including other factors such as term-based relevance feedback.
                BM25 bm25 = new BM25();
                //System.out.println(bm25.getInfo());

                // Include the document boost (1.0 here) in the ranking score.
                double termWeight = bm25.score(termFreq, docNum, docLen, avgDocLen, 1d, docFreq);

                // Apply the per-document relevance score from feedback, if present.
                if (relevanceDocMap != null && !relevanceDocMap.isEmpty() && relevanceDocMap.containsKey(id))
                    termWeight = termWeight * relevanceDocMap.get(id);

                //System.out.println("lucene id" + id  + " Doc id " + document.getField("DOCID").stringValue() + "wieght" + termWeight);

                if (results.containsKey(id)) {
                    results.get(id).score = results.get(id).score + termWeight;
                } else {
                    Result result = new Result(id, document.getField("DOCID").stringValue(),
                            termWeight, storedField);
                    results.put(id, result);
                }
            }
        }

        return sortArray(results, AppConstants.TOP_RESULTS);

    } catch (Exception e) {
        System.out.println("Exception: getResults " + e.toString());
    }

    return null;
}
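The BM25 class used above is part of this project and is not shown on this page. As a reference, here is a minimal sketch of the classic Okapi BM25 term weight with a signature matching the call site; the parameter meanings are inferred from the call, and k1 and b use the common defaults, so treat this as an assumption rather than the project's actual implementation.

public class BM25 {
    private static final double K1 = 1.2;  // term-frequency saturation
    private static final double B = 0.75;  // length normalization

    // tf: term frequency in the document; docNum: total documents;
    // docLen/avgDocLen: document and average field lengths;
    // boost: document boost; docFreq: number of documents containing the term.
    public double score(int tf, int docNum, int docLen, double avgDocLen, double boost, int docFreq) {
        double idf = Math.log(1.0 + (docNum - docFreq + 0.5) / (docFreq + 0.5));
        double norm = tf + K1 * (1.0 - B + B * (docLen / avgDocLen));
        return boost * idf * ((K1 + 1.0) * tf) / norm;
    }
}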

From source file:engine.easy.search.EasySearchEngine.java

License:Apache License

public String highlightedText() {

    try {
        Analyzer analyzer = new EasySearchAnalyzer();

        PhraseQuery phraseQuery = new PhraseQuery();
        phraseQuery.add(new Term("CONTENT", "KENNEDY"));
        phraseQuery.add(new Term("CONTENT", "ADMINISTRATION"));

        Directory indexDir = FSDirectory.open(new File(AppConstants.INDEX_DIR_PATH));
        IndexReader indexReader = IndexReader.open(indexDir);

        Query query = getQuery(phraseQuery.toString());
        QueryScorer scorer = new QueryScorer(query, AppConstants.CONTENT_FIELD);
        Highlighter highlighter = new Highlighter(scorer);

        Set<Term> terms = new HashSet<Term>();
        query.extractTerms(terms);

        Iterator<Term> itr = terms.iterator();
        StringBuffer text = new StringBuffer("");

        while (itr.hasNext()) {
            Term term = itr.next();
            TermDocs docs = indexReader.termDocs(term);

            while (docs.next()) {
                Integer id = docs.doc();
                Document document = indexReader.document(id);

                TokenStream stream = analyzer.tokenStream("FIELDNAME", new StringReader(text.toString()));

                //Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
                //highlighter.setTextFragmenter(fragmenter);

                //String fragment = highlighter.getBestFragment(analyzer, AppConstants.CONTENT_FIELD, storedField);
                //System.out.println(storedField); 
            }
        }
    } catch (Exception e) {
        System.out.println("Exception: getResults " + e.toString());
    }

    return null;
}
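The highlighting calls in this method were left commented out. For reference, a sketch of how the contrib Highlighter is usually completed; it assumes the analyzer, scorer, highlighter, and document objects built above, and getBestFragment re-tokenizes the stored text itself.

String storedText = document.get(AppConstants.CONTENT_FIELD);
if (storedText != null) {
    // Size fragments around the query matches.
    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
    String fragment = highlighter.getBestFragment(analyzer, AppConstants.CONTENT_FIELD, storedText);
    if (fragment != null)
        text.append(fragment);
}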

From source file:engine.easy.search.EasySearchEngine.java

License:Apache License

/**
 * Displays the results in highest-ranking order.
 *
 * @param results the array with result details
 * @param ixReader the index reader
 */
public void displayResults(Result[] results, IndexReader ixReader) {

    try {
        // Output the ranked results.
        for (int pos = 0; pos < results.length; pos++) {
            Result result = results[pos];
            int id = result.id;
            double score = result.score;
            Document doc = ixReader.document(result.id); // Fetch the stored document from the index reader
            String docid = doc.getField("DOCID").stringValue();
            //System.out.println("Result No."+(pos+1)+": Lucene id = "+id+", DOCID = "+docid+", score = "+score);
        }
    } catch (Exception e) {
        System.out.println("Exception - displayResults: " + e.toString());
    }
}

From source file:engine.easy.search.RelevanceFeedBackUtil.java

License:Apache License

/**
 * Performs the thumbs-up action: generates a new query from the top
 * highest-frequency terms and increases the boost of the relevant documents
 * so that they rank higher in search results for similar terms.
 */
public static Query performThumbsUp(List<Integer> luceneDocIds) throws IOException {

    Query q = null;

    try {
        final Map<String, Integer> frequencyMap = new HashMap<String, Integer>();
        Map<Integer, Document> documentMap = new HashMap<Integer, Document>();
        List<String> termsList = new ArrayList<String>();

        Directory indexDir = FSDirectory.open(new File(AppConstants.INDEX_DIR_PATH));
        IndexReader indexReader = IndexReader.open(indexDir);
        EasySearchIndexReader esiReader = new EasySearchIndexReader(indexReader);

        for (Integer docId : luceneDocIds) {

            TermFreqVector tfv = indexReader.getTermFreqVector(docId, "CONTENT");
            Document doc = indexReader.document(docId);
            float boost = doc.getBoost() + AppConstants.THUMBS_UP;
            doc.setBoost(boost);

            System.out.print("DOC : " + docId + " Field : " + tfv.getField() + "\n");

            for (int i = 0; i < tfv.getTermFrequencies().length; i++) {
                if (!termsList.contains(tfv.getTerms()[i]))
                    termsList.add(tfv.getTerms()[i]);

                System.out.println("TERM : " + tfv.getTerms()[i] + " FREQ : " + tfv.getTermFrequencies()[i]);
                frequencyMap.put(tfv.getTerms()[i], tfv.getTermFrequencies()[i]);
            }

            //put the document with doc id.
            documentMap.put(docId, doc);
        }

        // Close the index reader.
        indexReader.close();

        // Build a query from the top terms so these documents surface more often for similar searches.
        q = computeTopTermQuery(termsList, frequencyMap, AppConstants.TOP_DOCUMENTS);
        q.setBoost(2.0F);

        //Update the documents with their boost.
        //EasySearchIndexBuilder.updateDocuments(documentMap);

    } catch (Exception e) {
        System.out.println("Exception: performThumbsUp " + e.toString());
    }

    return q;
}
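computeTopTermQuery is defined elsewhere in this class. Judging from the call sites, it selects the topN most frequent terms and ORs them into a single query over the content field; the sketch below is a plausible implementation under that assumption, not the project's actual code.

private static Query computeTopTermQuery(List<String> terms, final Map<String, Integer> frequencyMap,
        int topN) {
    List<String> sorted = new ArrayList<String>(terms);
    Collections.sort(sorted, new Comparator<String>() {
        public int compare(String a, String b) {
            return frequencyMap.get(b) - frequencyMap.get(a); // descending frequency
        }
    });
    BooleanQuery query = new BooleanQuery();
    for (String term : sorted.subList(0, Math.min(topN, sorted.size()))) {
        query.add(new TermQuery(new Term(AppConstants.CONTENT_FIELD, term)), BooleanClause.Occur.SHOULD);
    }
    return query;
}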

From source file:engine.easy.search.RelevanceFeedBackUtil.java

License:Apache License

/**
 * Performs the thumbs-down action: generates a new query from the top
 * highest-frequency terms and decreases the boost of the relevant documents
 * so that they rank lower in search results for similar terms.
 */
public static Query performThumbsDown(List<Integer> luceneDocIds) {

    Query q = null;

    try {
        final Map<String, Integer> frequencyMap = new HashMap<String, Integer>();
        Map<Integer, Document> documentMap = new HashMap<Integer, Document>();
        List<String> termsList = new ArrayList<String>();

        Directory indexDir = FSDirectory.open(new File(AppConstants.INDEX_DIR_PATH));
        IndexReader indexReader = IndexReader.open(indexDir);
        EasySearchIndexReader esiReader = new EasySearchIndexReader(indexReader);

        for (Integer docId : luceneDocIds) {

            TermFreqVector tfv = indexReader.getTermFreqVector(docId, "CONTENT");
            Document doc = indexReader.document(docId);
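            // NOTE: this adds THUMBS_UP even though this is the thumbs-down path;
            // a THUMBS_DOWN constant (or a subtraction) was presumably intended.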
            float boost = doc.getBoost() + AppConstants.THUMBS_UP;
            doc.setBoost(boost);

            System.out.print("DOC : " + docId + " Field : " + tfv.getField() + "\n");

            for (int i = 0; i < tfv.getTermFrequencies().length; i++) {
                if (!termsList.contains(tfv.getTerms()[i]))
                    termsList.add(tfv.getTerms()[i]);

                System.out.println("TERM : " + tfv.getTerms()[i] + " FREQ : " + tfv.getTermFrequencies()[i]);
                frequencyMap.put(tfv.getTerms()[i], tfv.getTermFrequencies()[i]);
            }

            //put the document with doc id.
            documentMap.put(docId, doc);
        }

        // Close the index reader.
        indexReader.close();

        // Build a query from the top terms; the negative boost demotes these documents for similar searches.
        q = computeTopTermQuery(termsList, frequencyMap, AppConstants.TOP_DOCUMENTS);
        q.setBoost(-2.0F);

        //Update the documents with their boost.
        //EasySearchIndexBuilder.updateDocuments(documentMap);

    } catch (Exception e) {
        System.out.println("Exception: performThumbsDown " + e.toString());
    }

    return q;
}

From source file:engine.easy.search.RelevanceFeedBackUtil.java

License:Apache License

/**
 * Performs pseudo relevance feedback: generates a new query from the top
 * highest-frequency terms of the top-ranked results and boosts it with the
 * accumulated document boosts.
 */
public static Query performPesduoRelevance(Result[] results) {

    Query q = null;

    try {
        final Map<String, Integer> frequencyMap = new HashMap<String, Integer>();
        Map<Integer, Document> documentMap = new HashMap<Integer, Document>();
        List<String> termsList = new ArrayList<String>();

        Directory indexDir = FSDirectory.open(new File(AppConstants.INDEX_DIR_PATH));
        IndexReader indexReader = IndexReader.open(indexDir);
        EasySearchIndexReader esiReader = new EasySearchIndexReader(indexReader);
        float boost = 0F;

        for (Result result : results) {

            TermFreqVector tfv = indexReader.getTermFreqVector(result.id, AppConstants.CONTENT_FIELD);
            Document doc = indexReader.document(result.id);
            boost += doc.getBoost() + AppConstants.THUMBS_UP;

            System.out.print("DOC : " + result.id + " Field : " + tfv.getField() + "\n");

            for (int i = 0; i < tfv.getTermFrequencies().length; i++) {
                if (!termsList.contains(tfv.getTerms()[i]))
                    termsList.add(tfv.getTerms()[i]);

                frequencyMap.put(tfv.getTerms()[i], tfv.getTermFrequencies()[i]);
            }
        }

        // Close the index reader.
        indexReader.close();

        // Build a query from the top terms so these documents surface more often for similar searches.
        q = computeTopTermQuery(termsList, frequencyMap, AppConstants.TOP_DOCUMENTS);
        q.setBoost(boost);
        System.out.print("Query boost : " + boost);

    } catch (Exception e) {
        System.out.println("Exception: performPesduoRelevance " + e.toString());
    }

    return q;
}

From source file:engine.easy.search.RelevanceFeedBackUtil.java

License:Apache License

public static Query performUpAndDown(Map<Integer, Float> docMap) throws IOException {
    float boost = 0.0F;
    //String[] Ids = ids.split(",");

    Query q = null;

    try {
        final Map<String, Integer> frequencyMap = new HashMap<String, Integer>();
        Map<Integer, Document> documentMap = new HashMap<Integer, Document>();
        List<String> termsList = new ArrayList<String>();

        Directory indexDir = FSDirectory.open(new File(AppConstants.INDEX_DIR_PATH));
        IndexReader indexReader = IndexReader.open(indexDir);
        EasySearchIndexReader esiReader = new EasySearchIndexReader(indexReader);

        for (Integer docId : docMap.keySet()) {

            TermFreqVector tfv = indexReader.getTermFreqVector(docId, AppConstants.CONTENT_FIELD);
            Document doc = indexReader.document(docId);
            System.out.print("DOC : " + docId + " Field : " + tfv.getField() + "\n");

            for (int i = 0; i < tfv.getTermFrequencies().length; i++) {
                if (!termsList.contains(tfv.getTerms()[i]))
                    termsList.add(tfv.getTerms()[i]);

                System.out.println("TERM : " + tfv.getTerms()[i] + " FREQ : " + tfv.getTermFrequencies()[i]);
                frequencyMap.put(tfv.getTerms()[i], tfv.getTermFrequencies()[i]);
            }

            // put the document with doc id.
            documentMap.put(docId, doc);
        }

        // Close the index reader.
        indexReader.close();

        // Build a query from the top terms so these documents surface more often for similar searches.
        q = computeTopTermQuery(termsList, frequencyMap, AppConstants.TOP_DOCUMENTS);
        q.setBoost(AppConstants.BOOST);

    } catch (Exception e) {
        System.out.println("Exception: performUpAndDown " + e.toString());
    }

    return q;
}

From source file:es.pode.indexador.negocio.servicios.busqueda.SrvBuscadorServiceImpl.java

License:Open Source License

/**
 * This method looks up a random ODE from within the repository.
 * @return DocVO detail of an indexed ODE.
 */
protected DocVO handleObtenerODERandom() throws Exception {
    List listaIndices = (List) this.getIndiceDao().loadAll(getIndiceDao().TRANSFORM_INDICEVO);
    if (listaIndices.size() == 0) // No indices to list, nothing to return
        return null;
    Random random = new Random(Calendar.getInstance().getTimeInMillis());
    Document doc = null;
    boolean noCero = true;
    int intRandom = random.nextInt();
    int i = 0;
    int reintentosInt = 10; // intRandom may be 0; to get around that, make at most 10 attempts
    for (i = 0; i < reintentosInt && intRandom != 0 && noCero; i++) {

        // Pick an index at random from all the indices in the repository
        int idiomaSeleciconado = (intRandom < 0 ? (intRandom * (-1)) : intRandom) % listaIndices.size();
        IndiceVO indice = (IndiceVO) listaIndices.get(idiomaSeleciconado);
        // Open the index and check the number of indexed documents
        Directory directorioIndiceSimple = null;
        directorioIndiceSimple = this.getIndexByLanguage(indice.getIdentificador());
        IndexReader indiceLectura = IndexReader.open(directorioIndiceSimple);
        int numeroDocumentos = indiceLectura.numDocs();
        logger.debug("The number of documents in the index is " + numeroDocumentos);
        // Select the document we are going to extract
        if (numeroDocumentos > 0) {
            intRandom = random.nextInt();
            noCero = false;
            int documentoSeleccionado = (intRandom < 0 ? (intRandom * (-1)) : intRandom) % numeroDocumentos;
            logger.info("Returning document [" + documentoSeleccionado + "] of [" + numeroDocumentos
                    + "] total indexed documents.");
            doc = indiceLectura.document(documentoSeleccionado);
        }
        indiceLectura.close();
    }
    if (i == reintentosInt && noCero) {
        logger.info("No valid random document was found in [" + reintentosInt + "] attempts");
    }
    if (doc != null)
        return getVOFromLucene(doc, new DocVO(), 0);
    else
        return null;
}
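One caveat with the random pick above: numDocs() counts only live documents, while document(int) addresses raw slots, so on an index with deletions the chosen position can land on a deleted document. A defensive variant of the selection step (assuming the same Lucene 3.x reader) might be:

int documentoSeleccionado = (intRandom < 0 ? (intRandom * (-1)) : intRandom) % indiceLectura.maxDoc();
if (!indiceLectura.isDeleted(documentoSeleccionado)) {
    doc = indiceLectura.document(documentoSeleccionado);
}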

From source file:fr.ericlab.sondy.algo.eventdetection.TrendingScore.java

License:Open Source License

public ObservableList<DetectionResult> apply() {
    try {
        if (parameters.get(0).getValue() != null && !parameters.get(0).getValue().equals("")) {
            minTermSupport = Double.parseDouble(parameters.get(0).getValue());
        }
        if (parameters.get(1).getValue() != null && !parameters.get(1).getValue().equals("")) {
            maxTermSupport = Double.parseDouble(parameters.get(1).getValue());
        }
        if (parameters.get(2).getValue() != null && !parameters.get(2).getValue().equals("")) {
            trendingThreshold = Double.parseDouble(parameters.get(2).getValue());
        }
        long startNanoTime = System.nanoTime();
        IndexAccess indexAccess = new IndexAccess(appVariables);
        IndexReader r = indexAccess.reader;
        TermEnum allTerms = r.terms();
        HashMap<DetectionResult, Float> score = new HashMap<>();
        int intervalNumber = r.numDocs();
        float intervalDuration = ((float) appVariables.getCurrentDatasetInterval()) / 60;
        int minTermOccur = (int) (minTermSupport * appVariables.nbMessages),
                maxTermOccur = (int) (maxTermSupport * appVariables.nbMessages);
        int[] nbWordsPerDoc = new int[r.numDocs()];
        for (int luceneId : appVariables.globalIdMap.keySet()) {
            int sliceId = appVariables.globalIdMap.get(luceneId);
            Document doc = r.document(luceneId);
            String content = doc.get("content");
            int count = 0;
            for (int i = 0; i < content.length(); i++) {
                if (Character.isWhitespace(content.charAt(i)))
                    count++;
            }
            nbWordsPerDoc[sliceId] = count;
        }
        while (allTerms.next()) {
            String term = allTerms.term().text();
            if (term.length() > 1 && !appVariables.isStopWord(term)) {
                TermDocs termDocs = r.termDocs(allTerms.term());
                float frequency[] = indexAccess.getTermFrequency(appVariables, termDocs);
                float cf = frequency[intervalNumber];
                if (cf > minTermOccur && cf < maxTermOccur) {
                    double[] tfnorm = new double[intervalNumber];
                    double tfnormTotal = 0;
                    double[] trendingScore = new double[intervalNumber];
                    for (int i = appVariables.startTimeSlice; i <= appVariables.endTimeSlice; i++) {
                        tfnorm[i] = (frequency[i] / nbWordsPerDoc[i]) * Math.pow(10, 6);
                        tfnormTotal += tfnorm[i];
                    }
                    for (int i = appVariables.startTimeSlice; i <= appVariables.endTimeSlice; i++) {
                        trendingScore[i] = tfnorm[i] / ((tfnormTotal - tfnorm[i]) / (intervalNumber - 1));
                        if (trendingScore[i] > trendingThreshold) {
                            float dayS = (i * intervalDuration) / 24;
                            float dayE = ((i + 1) * intervalDuration) / 24;
                            score.put(
                                    new DetectionResult(term,
                                            formatter.format(dayS) + ";" + formatter.format(dayE)),
                                    (float) trendingScore[i]);
                        }
                    }
                }
            }
        }
        indexAccess.close();
        score = Collection.getSortedMapDesc(score);
        Set<Map.Entry<DetectionResult, Float>> entrySet = score.entrySet();
        results = FXCollections.observableArrayList();
        for (Map.Entry<DetectionResult, Float> entry : entrySet) {
            results.add(0, entry.getKey());
        }
        long endNanoTime = System.nanoTime();
        long elapsedNanoTime = endNanoTime - startNanoTime;
        double elapsedSecondTime = (double) elapsedNanoTime / (double) 1000000000;
        appVariables.addLogEntry("[event detection] computed trending scores, minTermSupport=" + minTermSupport
                + ", maxTermSupport=" + maxTermSupport + ", trendingThreshold=" + trendingThreshold + ". "
                + results.size() + " results in " + formatter.format(elapsedSecondTime) + "s");
        return results;
    } catch (IOException ex) {
        Logger.getLogger(PeakyTopics.class.getName()).log(Level.SEVERE, null, ex);
        return null;
    }
}
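For reference, the score computed above normalizes each slice's term frequency per million words (tfnorm[i]) and divides it by the mean normalized frequency of all the other slices, i.e. trendingScore[i] = tfnorm[i] / ((tfnormTotal - tfnorm[i]) / (intervalNumber - 1)); values above trendingThreshold flag slices where the term is unusually frequent relative to the rest of the period.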

From source file:game.TermFreq.java

void loadDoc() throws Exception {
    IndexReader reader = retriever.getReader();
    IndexSearcher searcher = retriever.getSearcher();

    Term docIdTerm = new Term(TrecDocRetriever.FIELD_ID, this.docIdToGuess);
    TermQuery tq = new TermQuery(docIdTerm);

    TopScoreDocCollector collector = TopScoreDocCollector.create(1, true);
    searcher.search(tq, collector);
    this.luceneDocIdToGuess = collector.topDocs().scoreDocs[0].doc;
    this.docToGuess = reader.document(luceneDocIdToGuess);
    this.contentOfDocToGuess = docToGuess.get(FIELD_ANALYZED_CONTENT);
}
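Note that scoreDocs[0] assumes the ID term matched at least one document; if docIdToGuess is not in the index, this line throws ArrayIndexOutOfBoundsException. A guarded version of that step might look like:

TopDocs topDocs = collector.topDocs();
if (topDocs.totalHits == 0)
    throw new IllegalArgumentException("No document found for id: " + docIdToGuess);
this.luceneDocIdToGuess = topDocs.scoreDocs[0].doc;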