Example usage for org.apache.lucene.index IndexReader numDocs

Introduction

On this page you will find usage examples for org.apache.lucene.index IndexReader.numDocs(), collected from open-source projects.

Prototype

public abstract int numDocs();

Documentation

Returns the number of documents in this index.
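
Note that numDocs() counts only live documents: deletions are excluded, whereas maxDoc() also counts deleted documents. Below is a minimal sketch of calling the method, assuming a Lucene 5+ API where FSDirectory.open takes a Path; the index location is a placeholder.

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // Open a reader on an existing index (the path is a placeholder).
        try (IndexReader reader = DirectoryReader.open(
                FSDirectory.open(Paths.get("/path/to/index")))) {
            // numDocs() excludes deleted documents; maxDoc() includes them.
            System.out.println("live docs:    " + reader.numDocs());
            System.out.println("max doc id:   " + reader.maxDoc());
            System.out.println("deleted docs: " + reader.numDeletedDocs());
        }
    }
}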

Usage

From source file:fr.ericlab.sondy.algo.eventdetection.PersistentConversations.java

License:Open Source License

public ObservableList<DetectionResult> apply() {
    try {
        if (parameters.get(0).getValue() != null && !parameters.get(0).getValue().equals("")) {
            minTermSupport = Double.parseDouble(parameters.get(0).getValue());
        }
        if (parameters.get(1).getValue() != null && !parameters.get(1).getValue().equals("")) {
            maxTermSupport = Double.parseDouble(parameters.get(1).getValue());
        }
        long startNanoTime = System.nanoTime();
        DataManipulation dataManipulation = new DataManipulation();
        IndexAccess indexAccess = new IndexAccess(appVariables);
        IndexReader r = indexAccess.reader;
        TermEnum allTerms = r.terms();
        HashMap<DetectionResult, Float> score = new HashMap<>();
        results = FXCollections.observableArrayList();
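        // Each Lucene document corresponds to one time slice here, so numDocs() gives the number of slices.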
        int intervalNumber = r.numDocs();
        float intervalDuration = ((float) appVariables.getCurrentDatasetInterval()) / 60;
        int minTermOccur = (int) (minTermSupport * appVariables.nbMessages),
                maxTermOccur = (int) (maxTermSupport * appVariables.nbMessages);
        while (allTerms.next()) {
            String term = allTerms.term().text();
            if (term.length() > 1 && !appVariables.isStopWord(term)) {
                TermDocs termDocs = r.termDocs(allTerms.term());
                float frequency[] = indexAccess.getTermFrequency(appVariables, termDocs);
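                // frequency[] apparently holds one count per slice (indices 0..numDocs-1) plus the total at index numDocs.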
                float cf = frequency[intervalNumber];
                if (cf > minTermOccur && cf < maxTermOccur) {
                    float tf = 0;
                    int maxDoc = 0;
                    for (int i = appVariables.startTimeSlice; i <= appVariables.endTimeSlice; i++) {
                        if (frequency[i] > tf) {
                            tf = frequency[i];
                            maxDoc = i;
                        }
                    }
                    float prePeakntf = 0;
                    for (int i = 0; i < maxDoc - 1; i++) {
                        prePeakntf += frequency[i] / cf;
                    }
                    prePeakntf = prePeakntf / (maxDoc - 1);
                    float postPeakntf = 0;
                    for (int i = maxDoc + 1; i < intervalNumber; i++) {
                        postPeakntf += frequency[i] / cf;
                    }
                    postPeakntf = postPeakntf / (intervalNumber - maxDoc);
                    if (prePeakntf > 0) {
                        float peakDay = (maxDoc * intervalDuration) / 24;
                        float peakDay1 = ((maxDoc + 1) * intervalDuration) / 24;
                        score.put(
                                new DetectionResult(term,
                                        formatter.format(peakDay) + ";" + formatter.format(peakDay1)),
                                postPeakntf / prePeakntf);
                    }
                }
            }
        }
        score = Collection.getSortedMapDesc(score);
        for (Map.Entry<DetectionResult, Float> entry : score.entrySet()) {
            DetectionResult key = entry.getKey();
            results.add(0, key);
        }
        indexAccess.close();
        long endNanoTime = System.nanoTime();
        long elapsedNanoTime = endNanoTime - startNanoTime;
        double elapsedSecondTime = (double) elapsedNanoTime / (double) 1000000000;
        appVariables.addLogEntry("[event detection] computed persistent conversations, minTermSupport="
                + minTermSupport + ", maxTermSupport=" + maxTermSupport + ". " + results.size() + " results in "
                + formatter.format(elapsedSecondTime) + "s");
        return results;
    } catch (IOException ex) {
        Logger.getLogger(PersistentConversations.class.getName()).log(Level.SEVERE, null, ex);
        return null;
    }
}

From source file:fr.ericlab.sondy.algo.eventdetection.TrendingScore.java

License:Open Source License

public ObservableList<DetectionResult> apply() {
    try {
        if (parameters.get(0).getValue() != null && !parameters.get(0).getValue().equals("")) {
            minTermSupport = Double.parseDouble(parameters.get(0).getValue());
        }
        if (parameters.get(1).getValue() != null && !parameters.get(1).getValue().equals("")) {
            maxTermSupport = Double.parseDouble(parameters.get(1).getValue());
        }
        if (parameters.get(2).getValue() != null && !parameters.get(2).getValue().equals("")) {
            trendingThreshold = Double.parseDouble(parameters.get(2).getValue());
        }
        long startNanoTime = System.nanoTime();
        IndexAccess indexAccess = new IndexAccess(appVariables);
        IndexReader r = indexAccess.reader;
        TermEnum allTerms = r.terms();
        HashMap<DetectionResult, Float> score = new HashMap<>();
        int intervalNumber = r.numDocs();
        float intervalDuration = ((float) appVariables.getCurrentDatasetInterval()) / 60;
        int minTermOccur = (int) (minTermSupport * appVariables.nbMessages),
                maxTermOccur = (int) (maxTermSupport * appVariables.nbMessages);
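        // One word count per time slice; counting whitespace characters below approximates words per slice.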
        int[] nbWordsPerDoc = new int[r.numDocs()];
        for (int luceneId : appVariables.globalIdMap.keySet()) {
            int sliceId = appVariables.globalIdMap.get(luceneId);
            Document doc = r.document(luceneId);
            String content = doc.get("content");
            int count = 0;
            for (int i = 0; i < content.length(); i++) {
                if (Character.isWhitespace(content.charAt(i)))
                    count++;
            }
            nbWordsPerDoc[sliceId] = count;
        }
        while (allTerms.next()) {
            String term = allTerms.term().text();
            if (term.length() > 1 && !appVariables.isStopWord(term)) {
                TermDocs termDocs = r.termDocs(allTerms.term());
                float frequency[] = indexAccess.getTermFrequency(appVariables, termDocs);
                float cf = frequency[intervalNumber];
                if (cf > minTermOccur && cf < maxTermOccur) {
                    double[] tfnorm = new double[intervalNumber];
                    double tfnormTotal = 0;
                    double[] trendingScore = new double[intervalNumber];
                    for (int i = appVariables.startTimeSlice; i <= appVariables.endTimeSlice; i++) {
                        tfnorm[i] = (frequency[i] / nbWordsPerDoc[i]) * Math.pow(10, 6);
                        tfnormTotal += tfnorm[i];
                    }
                    for (int i = appVariables.startTimeSlice; i <= appVariables.endTimeSlice; i++) {
                        trendingScore[i] = tfnorm[i] / ((tfnormTotal - tfnorm[i]) / (intervalNumber - 1));
                        if (trendingScore[i] > trendingThreshold) {
                            float dayS = (i * intervalDuration) / 24;
                            float dayE = ((i + 1) * intervalDuration) / 24;
                            score.put(
                                    new DetectionResult(term,
                                            formatter.format(dayS) + ";" + formatter.format(dayE)),
                                    (float) trendingScore[i]);
                        }
                    }
                }
            }
        }
        indexAccess.close();
        score = Collection.getSortedMapDesc(score);
        Set<Map.Entry<DetectionResult, Float>> entrySet = score.entrySet();
        results = FXCollections.observableArrayList();
        for (Map.Entry<DetectionResult, Float> entry : entrySet) {
            results.add(0, entry.getKey());
        }
        long endNanoTime = System.nanoTime();
        long elapsedNanoTime = endNanoTime - startNanoTime;
        double elapsedSecondTime = (double) elapsedNanoTime / (double) 1000000000;
        appVariables.addLogEntry("[event detection] computed trending scores, minTermSupport=" + minTermSupport
                + ", maxTermSupport=" + maxTermSupport + ", trendingThreshold=" + trendingThreshold + ". "
                + results.size() + " results in " + formatter.format(elapsedSecondTime) + "s");
        return results;
    } catch (IOException ex) {
        Logger.getLogger(TrendingScore.class.getName()).log(Level.SEVERE, null, ex);
        return null;
    }
}

From source file:fr.ericlab.sondy.ui.EventsUI.java

License:Open Source License

/**
 *
 * @param appVariables
 */
public void updateSimpleChart(AppVariables appVariables) {
    if (appVariables.selectedTerm != null) {
        String split[] = appVariables.selectedTerm.split(" ");
        rectangleSelection.setWidth(0);
        DataManipulation dataManipulation = new DataManipulation();
        xAxis.setUpperBound(appVariables.streamDuration);
        XYChart.Series seriesFreq = new XYChart.Series();
        seriesFreq.setName(split[0]);
        XYChart.Series seriesSmooth = new XYChart.Series();
        IndexAccess indexAccess = new IndexAccess(appVariables);
        IndexReader r = indexAccess.reader;
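        // numDocs() is the number of time slices; numDocs / 35 below sets the smoothing window width.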
        int numDocs = r.numDocs();
        float[] termFrequency = indexAccess.getTermFrequency(appVariables, split[0]);
        float[] smoothedTermFrequency = dataManipulation.getSmoothedTermFrequency(termFrequency, numDocs / 35);
        indexAccess.close();
        for (int i = appVariables.startTimeSlice; i <= appVariables.endTimeSlice; i++) {
            float day = ((i * ((float) appVariables.intervalDurationMin)) / 60) / 24;
            seriesFreq.getData().add(new XYChart.Data(day, termFrequency[i]));
            seriesSmooth.getData().add(new XYChart.Data(day, smoothedTermFrequency[i]));
        }
        if (!chart.getData().isEmpty()) {
            chart.getData().remove(0, chart.getData().size());
        }
        chart.getData().add(seriesFreq);
        chart.getData().add(seriesSmooth);
    } else {
        System.out.println("updateSimpleChart: no term selected " + appVariables.selectedTerm);
    }
}

From source file:fr.ericlab.sondy.ui.EventsUI.java

License:Open Source License

public void updateMentionChart(AppVariables appVariables) {
    if (appVariables.selectedTerm != null) {
        String split[] = appVariables.selectedTerm.split(" ");
        rectangleSelection.setWidth(0);
        DataManipulation dataManipulation = new DataManipulation();
        xAxis.setUpperBound(appVariables.streamDuration);
        XYChart.Series seriesFreq = new XYChart.Series();
        seriesFreq.setName(split[0]);
        XYChart.Series seriesSmooth = new XYChart.Series();
        MentionIndexAccess indexAccess = new MentionIndexAccess(appVariables);
        IndexReader r = indexAccess.mentionReader;
        int numDocs = r.numDocs();
        float[] termFrequency = indexAccess.getTermFrequency(appVariables, split[0]);
        float[] smoothedTermFrequency = dataManipulation.getSmoothedTermFrequency(termFrequency, numDocs / 35);
        indexAccess.close();
        for (int i = appVariables.startTimeSlice; i <= appVariables.endTimeSlice; i++) {
            float day = ((i * ((float) appVariables.intervalDurationMin)) / 60) / 24;
            seriesFreq.getData().add(new XYChart.Data(day, termFrequency[i]));
            seriesSmooth.getData().add(new XYChart.Data(day, smoothedTermFrequency[i]));
        }
        if (!chart.getData().isEmpty()) {
            chart.getData().remove(2, chart.getData().size());
        }
        chart.getData().add(seriesFreq);
        chart.getData().add(seriesSmooth);
    }
}

From source file:fr.ericlab.sondy.ui.EventsUI.java

License:Open Source License

/**
 *
 * @param appVariables
 * @param compareToTerm
 */
public void updateComparisonChart(AppVariables appVariables, String compareToTerm) {
    if (compareToTerm != null) {
        DataManipulation dataManipulation = new DataManipulation();
        xAxis.setUpperBound(appVariables.streamDuration);
        XYChart.Series seriesFreq = new XYChart.Series();
        seriesFreq.setName(compareToTerm);
        XYChart.Series seriesSmooth = new XYChart.Series();
        IndexAccess indexAccess = new IndexAccess(appVariables);
        IndexReader r = indexAccess.reader;
        int numDocs = r.numDocs();
        float[] termFrequency = indexAccess.getTermFrequency(appVariables, compareToTerm);
        float[] smoothedTermFrequency = dataManipulation.getSmoothedTermFrequency(termFrequency, numDocs / 35);
        indexAccess.close();
        for (int i = appVariables.startTimeSlice; i <= appVariables.endTimeSlice; i++) {
            float day = ((i * ((float) appVariables.intervalDurationMin)) / 60) / 24;
            seriesFreq.getData().add(new XYChart.Data(day, termFrequency[i]));
            seriesSmooth.getData().add(new XYChart.Data(day, smoothedTermFrequency[i]));
        }
        if (!chart.getData().isEmpty()) {
            chart.getData().remove(2, chart.getData().size());
        }
        chart.getData().add(seriesFreq);
        chart.getData().add(seriesSmooth);
    }
}

From source file:indexer.Cell.java

boolean toSplit(IndexReader reader) throws Exception {
    Cell parentCell = getCellIdOfParentCell();
    int df = 0;
    int numDocs = 0;

    Term parentCellTerm = new Term(DocVector.FIELD_CELL_ID, parentCell.toString());
    Term thisCellTerm = new Term(DocVector.FIELD_CELL_ID, this.toString());

    // Find the number of cells in this strip, e.g.
    // a. if the current cell is 5_2, 
    numDocs = parentCell.validCell() ? reader.docFreq(parentCellTerm) : reader.numDocs();
    df = reader.docFreq(thisCellTerm);

    int uniformCount = numDocs / DocVector.numIntervals;
    return df > uniformCount;
}
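
In this example numDocs() serves as a fallback population size: when the cell has no valid parent, the split heuristic compares the cell's document frequency against a uniform share of the whole index instead of a share of the parent cell's postings.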

From source file:indexer.IndexSplitter.java

public void split() throws Exception {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(indexDir));
    final int numDocs = reader.numDocs();
    IndexWriter pWriter; // pointer variable

    for (int i = 0; i < numDocs; i++) {
        Document d = reader.document(i);
        pWriter = d.get(FIELD_CODEMIXED).equals("1") ? mixedIndexWriter : pureIndexWriter;
        pWriter.addDocument(d);
    }

    reader.close();
    pureIndexWriter.close();
    mixedIndexWriter.close();
}
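
The split loop above assumes that document ids 0 through numDocs() - 1 are all live, which only holds for an index without deletions. Below is a sketch of a deletion-safe variant of the loop, assuming the Lucene 4-7 helpers org.apache.lucene.index.MultiFields and org.apache.lucene.util.Bits (in Lucene 8+ the equivalent is MultiBits.getLiveDocs).

Bits liveDocs = MultiFields.getLiveDocs(reader); // null when the index has no deletions
for (int i = 0; i < reader.maxDoc(); i++) {
    if (liveDocs != null && !liveDocs.get(i)) {
        continue; // skip deleted documents
    }
    Document d = reader.document(i);
    // ... route d to mixedIndexWriter or pureIndexWriter as above ...
}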

From source file:io.anserini.integration.IndexerTest.java

License:Apache License

@Test
public void testReadingPostings() throws Exception {
    Directory dir = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir);
    assertEquals(3, reader.numDocs());
    assertEquals(1, reader.leaves().size());

    System.out.println("Dumping out postings...");
    dumpPostings(reader);

    assertEquals(2, reader.docFreq(new Term("text", "here")));
    assertEquals(2, reader.docFreq(new Term("text", "more")));
    assertEquals(1, reader.docFreq(new Term("text", "some")));
    assertEquals(1, reader.docFreq(new Term("text", "test")));
    assertEquals(2, reader.docFreq(new Term("text", "text")));

    reader.close();
}

From source file:io.anserini.integration.IndexerTest.java

License:Apache License

@Test
public void testIterateThroughDocumentVector() throws Exception {
    Directory dir = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir);

    int numDocs = reader.numDocs();
    // Iterate through the document vectors
    for (int i = 0; i < numDocs; i++) {
        System.out.println(reader.document(i));
        Terms terms = reader.getTermVector(i, "text");
        TermsEnum te = terms.iterator();

        // For this document, iterate through the terms.
        Term term;
        while (te.next() != null) {
            term = new Term("text", te.term());
            long tf = te.totalTermFreq();
            // Print out the term and its term frequency
            System.out.println(term.bytes().utf8ToString() + " " + tf);
        }
    }
}
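
Two caveats apply to this pattern: getTermVector(i, "text") returns null for documents that were indexed without term vectors on that field, and iterating ids 0 to numDocs() - 1 is only safe when the index contains no deletions (otherwise iterate up to maxDoc() and consult the live-docs bitset, as sketched earlier).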

From source file:io.anserini.integration.IndexerTest.java

License:Apache License

@Test
public void testIterateThroughDocumentVectorComputeBM25() throws Exception {
    Directory dir = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());

    int numDocs = reader.numDocs();
    // Iterate through the document vectors
    for (int i = 0; i < numDocs; i++) {
        String docid = reader.document(i).getField("docid").stringValue();
        System.out.println(reader.document(i));
        System.out.println(i + ": " + docid);
        Terms terms = reader.getTermVector(i, "text");
        TermsEnum te = terms.iterator();

        // For this document, iterate through the terms.
        while (te.next() != null) {
            String term = new Term("text", te.term()).bytes().utf8ToString();
            long tf = te.totalTermFreq();

            // The way to compute the BM25 score is to issue a query with the exact docid and the
            // term in question, and look at the retrieval score.
            Query filterQuery = new TermQuery(new Term("docid", docid)); // the docid
            Query termQuery = new TermQuery(new Term("text", term)); // the term
            BooleanQuery.Builder builder = new BooleanQuery.Builder(); // must have both
            builder.add(filterQuery, BooleanClause.Occur.MUST);
            builder.add(termQuery, BooleanClause.Occur.MUST);
            Query finalQuery = builder.build();
            TopDocs rs = searcher.search(finalQuery, 1); // issue the query

            // The BM25 weight is the maxScore
            System.out.println(term + " " + tf + " " + rs.getMaxScore());
        }
    }
}
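
One caveat on the scoring trick: because both clauses are added with Occur.MUST, the docid clause contributes to the hit's score as well. If the goal is the term's BM25 weight alone, adding the docid clause with BooleanClause.Occur.FILTER (available since Lucene 5.1) keeps it as a non-scoring constraint.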