List of usage examples for org.apache.lucene.index.IndexReader#numDocs()
public abstract int numDocs();
From source file:fr.ericlab.sondy.algo.eventdetection.PersistentConversations.java
License:Open Source License
public ObservableList<DetectionResult> apply() { try {//w w w .j a v a 2 s . c o m if (parameters.get(0).getValue() != null && !parameters.get(0).getValue().equals("")) { minTermSupport = Double.parseDouble(parameters.get(0).getValue()); } if (parameters.get(1).getValue() != null && !parameters.get(1).getValue().equals("")) { maxTermSupport = Double.parseDouble(parameters.get(1).getValue()); } long startNanoTime = System.nanoTime(); DataManipulation dataManipulation = new DataManipulation(); IndexAccess indexAccess = new IndexAccess(appVariables); IndexReader r = indexAccess.reader; TermEnum allTerms = r.terms(); HashMap<DetectionResult, Float> score = new HashMap<>(); results = FXCollections.observableArrayList(); int intervalNumber = r.numDocs(); float intervalDuration = ((float) appVariables.getCurrentDatasetInterval()) / 60; int minTermOccur = (int) (minTermSupport * appVariables.nbMessages), maxTermOccur = (int) (maxTermSupport * appVariables.nbMessages); while (allTerms.next()) { String term = allTerms.term().text(); if (term.length() > 1 && !appVariables.isStopWord(term)) { TermDocs termDocs = r.termDocs(allTerms.term()); float frequency[] = indexAccess.getTermFrequency(appVariables, termDocs); float cf = frequency[intervalNumber]; if (cf > minTermOccur && cf < maxTermOccur) { float tf = 0; int maxDoc = 0; for (int i = appVariables.startTimeSlice; i <= appVariables.endTimeSlice; i++) { if (frequency[i] > tf) { tf = frequency[i]; maxDoc = i; } } float prePeakntf = 0; for (int i = 0; i < maxDoc - 1; i++) { prePeakntf += frequency[i] / cf; } prePeakntf = prePeakntf / (maxDoc - 1); float postPeaskntf = 0; for (int i = maxDoc + 1; i < intervalNumber; i++) { postPeaskntf += frequency[i] / cf; } postPeaskntf = postPeaskntf / (intervalNumber - maxDoc); if (prePeakntf > 0) { float peakDay = (maxDoc * intervalDuration) / 24; float peakDay1 = ((maxDoc + 1) * intervalDuration) / 24; score.put( new DetectionResult(term, formatter.format(peakDay) + ";" + 
formatter.format(peakDay1)), postPeaskntf / prePeakntf); } } } } score = Collection.getSortedMapDesc(score); for (Map.Entry<DetectionResult, Float> entry : score.entrySet()) { DetectionResult key = entry.getKey(); results.add(0, key); } indexAccess.close(); long endNanoTime = System.nanoTime(); long elapsedNanoTime = endNanoTime - startNanoTime; double elaspedSecondTime = (double) elapsedNanoTime / (double) 1000000000; appVariables.addLogEntry("[event detection] computed persistent conversations, minTermSupport=" + minTermSupport + ", maxTermSupport=" + maxTermSupport + ". " + results.size() + " results in " + formatter.format(elaspedSecondTime) + "s"); return results; } catch (IOException ex) { Logger.getLogger(PersistentConversations.class.getName()).log(Level.SEVERE, null, ex); return null; } }
From source file:fr.ericlab.sondy.algo.eventdetection.TrendingScore.java
License:Open Source License
/**
 * Computes trending scores per term and time slice: a slice's term frequency,
 * normalized per million words in that slice, divided by the mean normalized
 * frequency of all other slices. Slices whose score exceeds trendingThreshold
 * become detection results. Parameters 0-2 (when non-empty) override
 * minTermSupport, maxTermSupport and trendingThreshold.
 *
 * @return the observable list of detection results, or null on I/O failure
 */
public ObservableList<DetectionResult> apply() {
    try {
        if (parameters.get(0).getValue() != null && !parameters.get(0).getValue().equals("")) {
            minTermSupport = Double.parseDouble(parameters.get(0).getValue());
        }
        if (parameters.get(1).getValue() != null && !parameters.get(1).getValue().equals("")) {
            maxTermSupport = Double.parseDouble(parameters.get(1).getValue());
        }
        if (parameters.get(2).getValue() != null && !parameters.get(2).getValue().equals("")) {
            trendingThreshold = Double.parseDouble(parameters.get(2).getValue());
        }
        long startNanoTime = System.nanoTime();
        IndexAccess indexAccess = new IndexAccess(appVariables);
        IndexReader r = indexAccess.reader;
        TermEnum allTerms = r.terms();
        HashMap<DetectionResult, Float> score = new HashMap<>();
        // One Lucene document per time slice.
        int intervalNumber = r.numDocs();
        float intervalDuration = ((float) appVariables.getCurrentDatasetInterval()) / 60;
        int minTermOccur = (int) (minTermSupport * appVariables.nbMessages),
                maxTermOccur = (int) (maxTermSupport * appVariables.nbMessages);
        // Per-slice word counts, approximated by counting whitespace characters.
        // NOTE(review): this undercounts by one word per slice and double-counts
        // consecutive blanks -- confirm whether an exact token count is wanted.
        int[] nbWordsPerDoc = new int[r.numDocs()];
        for (int luceneId : appVariables.globalIdMap.keySet()) {
            int sliceId = appVariables.globalIdMap.get(luceneId);
            Document doc = r.document(luceneId);
            String content = doc.get("content");
            int count = 0;
            for (int i = 0; i < content.length(); i++) {
                if (Character.isWhitespace(content.charAt(i)))
                    count++;
            }
            nbWordsPerDoc[sliceId] = count;
        }
        while (allTerms.next()) {
            String term = allTerms.term().text();
            if (term.length() > 1 && !appVariables.isStopWord(term)) {
                TermDocs termDocs = r.termDocs(allTerms.term());
                float frequency[] = indexAccess.getTermFrequency(appVariables, termDocs);
                // Slot [intervalNumber] holds the term's total frequency over the stream.
                float cf = frequency[intervalNumber];
                if (cf > minTermOccur && cf < maxTermOccur) {
                    double[] tfnorm = new double[intervalNumber];
                    double tfnormTotal = 0;
                    double[] trendingScore = new double[intervalNumber];
                    for (int i = appVariables.startTimeSlice; i <= appVariables.endTimeSlice; i++) {
                        // Term frequency per million words in the slice.
                        tfnorm[i] = (frequency[i] / nbWordsPerDoc[i]) * Math.pow(10, 6);
                        tfnormTotal += tfnorm[i];
                    }
                    for (int i = appVariables.startTimeSlice; i <= appVariables.endTimeSlice; i++) {
                        // Ratio of this slice's normalized tf to the mean of the others.
                        trendingScore[i] = tfnorm[i] / ((tfnormTotal - tfnorm[i]) / (intervalNumber - 1));
                        if (trendingScore[i] > trendingThreshold) {
                            float dayS = (i * intervalDuration) / 24;
                            float dayE = ((i + 1) * intervalDuration) / 24;
                            score.put(new DetectionResult(term,
                                    formatter.format(dayS) + ";" + formatter.format(dayE)),
                                    (float) trendingScore[i]);
                        }
                    }
                }
            }
        }
        indexAccess.close();
        // Collection here is a project helper class, not java.util.Collection.
        score = Collection.getSortedMapDesc(score);
        Set<Map.Entry<DetectionResult, Float>> entrySet = score.entrySet();
        results = FXCollections.observableArrayList();
        for (Map.Entry<DetectionResult, Float> entry : entrySet) {
            // Prepending reverses the descending map into ascending list order.
            results.add(0, entry.getKey());
        }
        long endNanoTime = System.nanoTime();
        long elapsedNanoTime = endNanoTime - startNanoTime;
        double elaspedSecondTime = (double) elapsedNanoTime / (double) 1000000000;
        appVariables.addLogEntry("[event detection] computed trending scores, minTermSupport="
                + minTermSupport + ", maxTermSupport=" + maxTermSupport + ", trendingThreshold="
                + trendingThreshold + ". " + results.size() + " results in "
                + formatter.format(elaspedSecondTime) + "s");
        return results;
    } catch (IOException ex) {
        // Fixed: the original logged under PeakyTopics.class -- a copy-paste slip
        // that attributed this algorithm's failures to the wrong class.
        Logger.getLogger(TrendingScore.class.getName()).log(Level.SEVERE, null, ex);
        return null;
    }
}
From source file:fr.ericlab.sondy.ui.EventsUI.java
License:Open Source License
/** * * @param appVariables/*from ww w. j a v a2s . c o m*/ */ public void updateSimpleChart(AppVariables appVariables) { if (appVariables.selectedTerm != null) { String split[] = appVariables.selectedTerm.split(" "); rectangleSelection.setWidth(0); DataManipulation dataManipulation = new DataManipulation(); xAxis.setUpperBound(appVariables.streamDuration); XYChart.Series seriesFreq = new XYChart.Series(); seriesFreq.setName(split[0]); XYChart.Series seriesSmooth = new XYChart.Series(); IndexAccess indexAccess = new IndexAccess(appVariables); IndexReader r = indexAccess.reader; int numDocs = r.numDocs(); float[] termFrequency = indexAccess.getTermFrequency(appVariables, split[0]); float[] smoothedTermFrequency = dataManipulation.getSmoothedTermFrequency(termFrequency, numDocs / 35); indexAccess.close(); for (int i = appVariables.startTimeSlice; i <= appVariables.endTimeSlice; i++) { float day = ((i * ((float) appVariables.intervalDurationMin)) / 60) / 24; seriesFreq.getData().add(new XYChart.Data(day, termFrequency[i])); seriesSmooth.getData().add(new XYChart.Data(day, smoothedTermFrequency[i])); } if (!chart.getData().isEmpty()) { chart.getData().remove(0, chart.getData().size()); } chart.getData().add(seriesFreq); chart.getData().add(seriesSmooth); } else { System.out.println("updateSimpleChart: no term selected " + appVariables.selectedTerm); } }
From source file:fr.ericlab.sondy.ui.EventsUI.java
License:Open Source License
public void updateMentionChart(AppVariables appVariables) { if (appVariables.selectedTerm != null) { String split[] = appVariables.selectedTerm.split(" "); rectangleSelection.setWidth(0);//from w w w.j ava 2 s .co m DataManipulation dataManipulation = new DataManipulation(); xAxis.setUpperBound(appVariables.streamDuration); XYChart.Series seriesFreq = new XYChart.Series(); seriesFreq.setName(split[0]); XYChart.Series seriesSmooth = new XYChart.Series(); MentionIndexAccess indexAccess = new MentionIndexAccess(appVariables); IndexReader r = indexAccess.mentionReader; int numDocs = r.numDocs(); float[] termFrequency = indexAccess.getTermFrequency(appVariables, split[0]); float[] smoothedTermFrequency = dataManipulation.getSmoothedTermFrequency(termFrequency, numDocs / 35); indexAccess.close(); for (int i = appVariables.startTimeSlice; i <= appVariables.endTimeSlice; i++) { float day = ((i * ((float) appVariables.intervalDurationMin)) / 60) / 24; seriesFreq.getData().add(new XYChart.Data(day, termFrequency[i])); seriesSmooth.getData().add(new XYChart.Data(day, smoothedTermFrequency[i])); } if (!chart.getData().isEmpty()) { chart.getData().remove(2, chart.getData().size()); } chart.getData().add(seriesFreq); chart.getData().add(seriesSmooth); } }
From source file:fr.ericlab.sondy.ui.EventsUI.java
License:Open Source License
/** * * @param appVariables/*from w w w.j a va2 s . c om*/ * @param compareToTerm */ public void updateComparisonChart(AppVariables appVariables, String compareToTerm) { if (compareToTerm != null) { DataManipulation dataManipulation = new DataManipulation(); xAxis.setUpperBound(appVariables.streamDuration); XYChart.Series seriesFreq = new XYChart.Series(); seriesFreq.setName(compareToTerm); XYChart.Series seriesSmooth = new XYChart.Series(); IndexAccess indexAccess = new IndexAccess(appVariables); IndexReader r = indexAccess.reader; int numDocs = r.numDocs(); float[] termFrequency = indexAccess.getTermFrequency(appVariables, compareToTerm); float[] smoothedTermFrequency = dataManipulation.getSmoothedTermFrequency(termFrequency, numDocs / 35); indexAccess.close(); for (int i = appVariables.startTimeSlice; i <= appVariables.endTimeSlice; i++) { float day = ((i * ((float) appVariables.intervalDurationMin)) / 60) / 24; seriesFreq.getData().add(new XYChart.Data(day, termFrequency[i])); seriesSmooth.getData().add(new XYChart.Data(day, smoothedTermFrequency[i])); } if (!chart.getData().isEmpty()) { chart.getData().remove(2, chart.getData().size()); } chart.getData().add(seriesFreq); chart.getData().add(seriesSmooth); } }
From source file:indexer.Cell.java
boolean toSplit(IndexReader reader) throws Exception { Cell parentCell = getCellIdOfParentCell(); int df = 0;/*w w w. j a va 2s. c o m*/ int numDocs = 0; Term parentCellTerm = new Term(DocVector.FIELD_CELL_ID, parentCell.toString()); Term thisCellTerm = new Term(DocVector.FIELD_CELL_ID, this.toString()); // Find the number of cells in this strip, e.g. // a. if the current cell is 5_2, numDocs = parentCell.validCell() ? reader.docFreq(parentCellTerm) : reader.numDocs(); df = reader.docFreq(thisCellTerm); int uniformCount = numDocs / DocVector.numIntervals; return df > uniformCount; }
From source file:indexer.IndexSplitter.java
public void split() throws Exception { IndexReader reader = DirectoryReader.open(FSDirectory.open(indexDir)); final int numDocs = reader.numDocs(); IndexWriter pWriter; // pointer variable for (int i = 0; i < numDocs; i++) { Document d = reader.document(i); pWriter = d.get(FIELD_CODEMIXED).equals("1") ? mixedIndexWriter : pureIndexWriter; pWriter.addDocument(d);/*from w ww. j av a2s . c o m*/ } reader.close(); pureIndexWriter.close(); mixedIndexWriter.close(); }
From source file:io.anserini.integration.IndexerTest.java
License:Apache License
@Test public void testReadingPostings() throws Exception { Directory dir = FSDirectory.open(tempDir1); IndexReader reader = DirectoryReader.open(dir); assertEquals(3, reader.numDocs()); assertEquals(1, reader.leaves().size()); System.out.println("Dumping out postings..."); dumpPostings(reader);//from ww w . ja va 2 s . c o m assertEquals(2, reader.docFreq(new Term("text", "here"))); assertEquals(2, reader.docFreq(new Term("text", "more"))); assertEquals(1, reader.docFreq(new Term("text", "some"))); assertEquals(1, reader.docFreq(new Term("text", "test"))); assertEquals(2, reader.docFreq(new Term("text", "text"))); reader.close(); }
From source file:io.anserini.integration.IndexerTest.java
License:Apache License
@Test public void testIterateThroughDocumentVector() throws Exception { Directory dir = FSDirectory.open(tempDir1); IndexReader reader = DirectoryReader.open(dir); int numDocs = reader.numDocs(); // Iterate through the document vectors for (int i = 0; i < numDocs; i++) { System.out.println(reader.document(i)); Terms terms = reader.getTermVector(i, "text"); TermsEnum te = terms.iterator(); // For this document, iterate through the terms. Term term;//from w w w. j a v a2s .c o m while (te.next() != null) { term = new Term("text", te.term()); long tf = te.totalTermFreq(); // Print out the term and its term frequency System.out.println(term.bytes().utf8ToString() + " " + tf); } } }
From source file:io.anserini.integration.IndexerTest.java
License:Apache License
@Test public void testIterateThroughDocumentVectorComputeBM25() throws Exception { Directory dir = FSDirectory.open(tempDir1); IndexReader reader = DirectoryReader.open(dir); IndexSearcher searcher = new IndexSearcher(reader); searcher.setSimilarity(new BM25Similarity()); int numDocs = reader.numDocs(); // Iterate through the document vectors for (int i = 0; i < numDocs; i++) { String docid = reader.document(i).getField("docid").stringValue(); System.out.println(reader.document(i)); System.out.println(i + ": " + docid); Terms terms = reader.getTermVector(i, "text"); TermsEnum te = terms.iterator(); // For this document, iterate through the terms. while (te.next() != null) { String term = new Term("text", te.term()).bytes().utf8ToString(); long tf = te.totalTermFreq(); // The way to compute the BM25 score is to issue a query with the exact docid and the // term in question, and look at the retrieval score. Query filterQuery = new TermQuery(new Term("docid", docid)); // the docid Query termQuery = new TermQuery(new Term("text", term)); // the term BooleanQuery.Builder builder = new BooleanQuery.Builder(); // must have both builder.add(filterQuery, BooleanClause.Occur.MUST); builder.add(termQuery, BooleanClause.Occur.MUST); Query finalQuery = builder.build(); TopDocs rs = searcher.search(finalQuery, 1); // issue the query // The BM25 weight is the maxScore System.out.println(term + " " + tf + " " + rs.getMaxScore()); }//from ww w .j av a2s . co m } }