List of usage examples for org.apache.lucene.index.IndexReader.maxDoc()
public abstract int maxDoc();
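maxDoc() returns one greater than the largest document number used in the reader's view of the index, so deleted-but-unmerged document slots are still counted; it is therefore the usual exclusive upper bound when looping over every document id, as the examples below do. A minimal iteration sketch (assuming Lucene 3.x-era APIs where IndexReader.open and isDeleted are still available; 'directory' is a hypothetical, already-populated Directory):

    // Minimal sketch: maxDoc() is an exclusive upper bound over all document
    // slots, including deleted ones, so deleted slots must be skipped explicitly.
    IndexReader reader = IndexReader.open(directory, true); // read-only reader
    try {
        for (int docId = 0; docId < reader.maxDoc(); docId++) {
            if (reader.isDeleted(docId)) {
                continue; // slot belongs to a deleted document
            }
            Document doc = reader.document(docId);
            // ... process the stored fields of doc ...
        }
    } finally {
        reader.close();
    }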
From source file: com.mmiagency.knime.nodes.keyworddensity.util.KeywordDensityHelper.java
License: Open Source License
public void execute() throws IOException {
    org.jsoup.nodes.Document jdoc = null;

    // pull content using Jsoup
    if (m_content != null && !m_content.trim().isEmpty()) {
        jdoc = Jsoup.parse(m_content);
    } else {
        Connection conn = Jsoup.connect(m_url);
        conn.validateTLSCertificates(false);
        conn.followRedirects(true);
        conn.userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:40.0) Gecko/20100101 Firefox/40.0");
        conn.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        conn.header("Accept-Language", "en-US,en;q=0.5");
        conn.header("Accept-Encoding", "gzip, deflate");
        conn.execute();
        jdoc = conn.get();
    }

    StringWriter text = new StringWriter();

    if (m_includeMetaKeywords) {
        text.write(jdoc.select("meta[name=keywords]").attr("content"));
        text.write(" ");
    }
    if (m_includeMetaDescription) {
        text.write(jdoc.select("meta[name=description]").attr("content"));
        text.write(" ");
    }
    if (m_includePageTitle) {
        text.write(jdoc.select("title").text());
        text.write(" ");
    }
    text.write(jdoc.select("body").text());

    // analyze content with Lucene
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    Directory directory = new RAMDirectory();
    IndexWriter indexWriter = new IndexWriter(directory, analyzer, MaxFieldLength.LIMITED);

    Document doc = new Document();
    Field textField = new Field("content", text.toString(), Field.Store.YES, Field.Index.ANALYZED,
            TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(textField);
    indexWriter.addDocument(doc);
    indexWriter.commit();
    indexWriter.close();

    IndexReader indexReader = IndexReader.open(directory, true);
    TermFreqVector termFreqVector = null;

    for (int i = 0; i < indexReader.maxDoc(); i++) {
        termFreqVector = indexReader.getTermFreqVector(i, "content");
        String[] terms = termFreqVector.getTerms();
        int[] freqs = termFreqVector.getTermFrequencies();
        for (int n = 0; n < termFreqVector.size(); n++) {
            if (m_excludeList.contains(terms[n])) {
                continue;
            }
            add(terms[n], freqs[n]);
        }
    }

    indexReader.close();
    directory.close();

    // sort map by value
    sortMap();
}
From source file: com.nearinfinity.blur.manager.IndexManager.java
License: Apache License
public static void fetchRow(IndexReader reader, String table, Selector selector, FetchResult fetchResult)
        throws CorruptIndexException, IOException {
    fetchResult.table = table;
    String locationId = selector.locationId;
    int lastSlash = locationId.lastIndexOf('/');
    int docId = Integer.parseInt(locationId.substring(lastSlash + 1));
    if (docId >= reader.maxDoc()) {
        throw new RuntimeException("Location id [" + locationId + "] with docId [" + docId + "] is not valid.");
    }
    if (selector.isRecordOnly()) {
        // select only the row for the given data or location id.
        if (reader.isDeleted(docId)) {
            fetchResult.exists = false;
            fetchResult.deleted = true;
            return;
        } else {
            fetchResult.exists = true;
            fetchResult.deleted = false;
            Document document = reader.document(docId, getFieldSelector(selector));
            fetchResult.recordResult = getColumns(document);
            return;
        }
    } else {
        if (reader.isDeleted(docId)) {
            fetchResult.exists = false;
            fetchResult.deleted = true;
            return;
        } else {
            fetchResult.exists = true;
            fetchResult.deleted = false;
            String rowId = getRowId(reader, docId);
            TermDocs termDocs = reader.termDocs(new Term(ROW_ID, rowId));
            fetchResult.rowResult = new FetchRowResult(
                    getRow(new TermDocIterable(termDocs, reader, getFieldSelector(selector))));
            return;
        }
    }
}
From source file: com.nearinfinity.blur.utils.PrimeDocCache.java
License: Apache License
/**
 * The way this method is called via warm up methods, the likelihood of
 * creating multiple bitsets during a race condition is very low; that's why
 * this method is not synced.
 */
public static OpenBitSet getPrimeDocBitSet(IndexReader reader) throws IOException {
    Object key = reader.getCoreCacheKey();
    OpenBitSet bitSet = primeDocMap.get(key);
    if (bitSet == null) {
        reader.addReaderClosedListener(new ReaderClosedListener() {
            @Override
            public void onClose(IndexReader reader) {
                Object key = reader.getCoreCacheKey();
                LOG.debug("Current size [" + primeDocMap.size() + "] Prime Doc BitSet removing for segment ["
                        + reader + "]");
                primeDocMap.remove(key);
            }
        });
        LOG.debug("Prime Doc BitSet missing for segment [" + reader + "] current size [" + primeDocMap.size()
                + "]");
        bitSet = new OpenBitSet(reader.maxDoc());
        primeDocMap.put(key, bitSet);
        TermDocs termDocs = reader.termDocs(BlurConstants.PRIME_DOC_TERM);
        while (termDocs.next()) {
            bitSet.set(termDocs.doc());
        }
        termDocs.close();
    }
    return bitSet;
}
From source file: com.o19s.solr.swan.highlight.SpanAwareFieldTermStack.java
License: Apache License
/**
 * a constructor.
 *
 * @param reader IndexReader of the index
 * @param docId document id to be highlighted
 * @param fieldName field of the document to be highlighted
 * @param fieldQuery FieldQuery object
 * @throws IOException If there is a low-level I/O error
 */
public SpanAwareFieldTermStack(IndexReader reader, int docId, String fieldName,
        final SpanAwareFieldQuery fieldQuery) throws IOException {
    this.fieldName = fieldName;

    Set<String> termSet = fieldQuery.getTermSet(fieldName);
    Set<String> alwaysHighlightTermSet = fieldQuery.getHighlightTermSet(fieldName);

    // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
    if (termSet == null)
        return;

    final Fields vectors = reader.getTermVectors(docId);
    if (vectors == null) {
        // null snippet
        return;
    }

    final Terms vector = vectors.terms(fieldName);
    if (vector == null) {
        // null snippet
        return;
    }

    final CharsRef spare = new CharsRef();
    final TermsEnum termsEnum = vector.iterator(null);
    DocsAndPositionsEnum dpEnum = null;
    BytesRef text;

    int numDocs = reader.maxDoc();

    while ((text = termsEnum.next()) != null) {
        UnicodeUtil.UTF8toUTF16(text, spare);
        final String term = spare.toString();
        if (!termSet.contains(term)) {
            continue;
        }
        dpEnum = termsEnum.docsAndPositions(null, dpEnum);
        if (dpEnum == null) {
            // null snippet
            return;
        }

        dpEnum.nextDoc();

        // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
        final float weight = (float) (Math
                .log(numDocs / (double) (reader.docFreq(new Term(fieldName, text)) + 1)) + 1.0);

        final int freq = dpEnum.freq();

        for (int i = 0; i < freq; i++) {
            int pos = dpEnum.nextPosition();
            if (dpEnum.startOffset() < 0) {
                return; // no offsets, null snippet
            }
            if (alwaysHighlightTermSet.contains(term)
                    || fieldQuery.doesDocFieldContainPosition(fieldName, docId, dpEnum.startOffset())) {
                termList.add(new TermInfo(term, dpEnum.startOffset(), dpEnum.endOffset(), pos, weight));
            }
        }
    }

    // sort by position
    Collections.sort(termList);
}
From source file: com.pjaol.lucene.search.SerialChainFilter.java
License: Apache License
@Override
// public BitSet bits(IndexReader reader) throws CorruptIndexException, IOException {
public BitSet bits(IndexReader reader) throws IOException {

    if (chain.length == 0) {
        BitSet bits = new BitSet(reader.maxDoc());
        for (int i = 0; i < bits.size(); i++)
            bits.set(i);
        return bits;
    }

    BitSet bits = new BitSet(reader.maxDoc());

    int chainSize = chain.length;
    int actionSize = actionType.length;

    int i = 0;
    /**
     * taken from ChainedFilter, first and on an empty bitset results in 0
     */
    if (actionType[i] == AND) {
        try {
            bits = (BitSet) chain[i].bits(reader).clone();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        ++i;
    }

    for (; i < chainSize; i++) {

        int action = (i < actionSize) ? actionType[i] : DEFAULT;

        switch (action) {
        case (SERIALAND):
            try {
                bits.and(((ISerialChainFilter) chain[i]).bits(reader, bits));
                // } catch (CorruptIndexException e) {
                //     // TODO Auto-generated catch block
                //     e.printStackTrace();
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            break;
        case (SERIALOR):
            try {
                bits.or(((ISerialChainFilter) chain[i]).bits(reader, bits));
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            break;
        case (AND):
            bits.and(chain[i].bits(reader));
            break;
        case (OR):
            bits.or(chain[i].bits(reader));
            break;
        }
    }
    System.out.println("Filter: " + this.toString() + " returns:" + bits.cardinality() + " docs");
    return bits;
}
From source file: com.redhat.satellite.search.index.IndexManager.java
License: Open Source License
/**
 * Removes any documents which are not related to the passed in Set of good values
 * @param ids Set of ids of all known/good values
 * @param indexName index name to operate on
 * @param uniqField the name of the field in the Document to uniquely identify
 * this record
 * @return the number of documents deleted
 */
public int deleteRecordsNotInList(Set<String> ids, String indexName, String uniqField) {
    int count = 0;
    IndexReader reader = null;
    try {
        reader = getIndexReader(indexName, IndexHandler.DEFAULT_LANG);

        // Use maxDoc() to iterate over all docs, numDocs() returns the
        // number of currently alive docs leaving out the deleted ones.
        int maxDoc = reader.maxDoc();
        for (int i = 0; i < maxDoc; i++) {
            if (!reader.isDeleted(i)) {
                Document doc = reader.document(i);
                String uniqId = doc.getField(uniqField).stringValue();
                if (!ids.contains(uniqId)) {
                    log.info(indexName + ":" + uniqField + ": <" + uniqId
                            + "> not found in list of current/good values "
                            + "assuming this has been deleted from Database and we "
                            + "should remove it.");
                    removeFromIndex(indexName, uniqField, uniqId);
                    count++;
                }
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
        log.info("deleteRecordsNotInList() caught exception : " + e);
    } catch (IndexingException e) {
        e.printStackTrace();
        log.info("deleteRecordsNotInList() caught exception : " + e);
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                //
            }
        }
    }
    return count;
}
From source file: com.searchbox.SuggeterDataStructureBuilder.java
License: Apache License
private void iterateThroughDocuments(SolrIndexSearcher searcher, String[] fields, int maxNumDocs) {
    IndexReader reader = searcher.getIndexReader();

    // WARNING: returns null if there are no deletions
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    maxNumDocs = Math.min(maxNumDocs, reader.maxDoc());

    if (maxNumDocs == -1) {
        maxNumDocs = reader.maxDoc();
    }
    LOGGER.info("Analyzing docs:\t" + numdocs);

    for (int docID = 0; docID < reader.maxDoc(); docID++) {
        if (numdocs > maxNumDocs) {
            break;
        }
        if (liveDocs != null && !liveDocs.get(docID)) {
            continue; // deleted
        }

        if ((docID % 1000) == 0) {
            LOGGER.debug("Doing " + docID + " of " + maxNumDocs);
        }

        StringBuilder text = new StringBuilder();
        for (String field : fields) {
            /*
             * not sure if this is the best way, might make sense to do a
             * process text for each field individually, but then book
             * keeping the doc freq for terms becomes a bit of a pain in the
             * ass
             */
            try {
                IndexableField[] multifield = reader.document(docID).getFields(field);
                for (IndexableField singlefield : multifield) {
                    // create one big string from all of the text in the
                    // documents for processing later on
                    text.append(". " + singlefield.stringValue());
                }
            } catch (IOException ex) {
                LOGGER.warn("Document " + docID + " missing requested field (" + field + ")...ignoring");
            }
        }

        // might as well see if its empty
        if (text.length() > 0) {
            // actually processes the massive string which was created from
            // all of the above fields
            processText(text.toString().toLowerCase());
            numdocs++;
        }
    }

    LOGGER.info("Number of documents analyzed: \t" + numdocs);
    for (int zz = 0; zz < counts.length; zz++) {
        LOGGER.info("Number of " + zz + "-grams: \t" + counts[zz]);
    }
}
From source file: com.searchbox.Tagger.java
License: Apache License
private void DfCountBuilder(SolrIndexSearcher searcher, String[] fields, int maxNumDocs) {
    IndexReader reader = searcher.getIndexReader();
    Bits liveDocs = MultiFields.getLiveDocs(reader); // WARNING: returns null if
                                                     // there are no deletions
    maxNumDocs = Math.min(maxNumDocs, reader.maxDoc());

    if (maxNumDocs == -1) {
        maxNumDocs = reader.maxDoc();
    }
    LOGGER.info("Analyzing docs:\t" + numdocs);

    for (int docID = 0; docID < reader.maxDoc(); docID++) {
        if (numdocs > maxNumDocs) {
            break;
        }
        if (liveDocs != null && !liveDocs.get(docID)) {
            continue; // deleted
        }

        if ((docID % 1000) == 0) {
            LOGGER.debug("Doing " + docID + " of " + maxNumDocs);
        }

        StringBuilder text = new StringBuilder();
        for (String field : fields) {
            // not sure if this is the best way, might make sense to do a
            // process text for each field individually, but then book keeping
            // the doc freq for terms becomes a bit of a pain in the ass
            try {
                text.append(". " + reader.document(docID).get(field));
            } catch (IOException ex) {
                LOGGER.warn("Document " + docID + " missing requested field (" + field + ")...ignoring");
            }
        }

        if (text.length() > 0) { // might as well see if its empty
            processDocText(text.toString());
            numdocs++;
        }
    }

    LOGGER.info("Number of documents analyzed: \t" + numdocs);
    dfcounts.put(DOC_COUNTS_STRING, numdocs);
    tfcounts.put(DOC_COUNTS_STRING, numdocs);
}
From source file: com.sxc.lucene.index.IndexingTest.java
License: Apache License
public void testIndexReader() throws IOException {
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(ids.length, reader.maxDoc()); // 8
    assertEquals(ids.length, reader.numDocs()); // 8
    reader.close();
}
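The two assertions above agree only because no documents have been deleted from the test index; maxDoc() and numDocs() diverge once documents are deleted but their slots have not yet been merged away. A small sketch of that difference (assuming Lucene 4.x-era APIs such as DirectoryReader, StringField, and Version.LUCENE_46; the method name and field names here are illustrative, not taken from the test above):

    // Minimal sketch (assumed Lucene 4.x APIs): after a delete, maxDoc() still
    // counts the deleted slot until a merge removes it, while numDocs() reflects
    // live documents only.
    public void showMaxDocVersusNumDocs() throws IOException {
        Directory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir,
                new IndexWriterConfig(Version.LUCENE_46, new StandardAnalyzer(Version.LUCENE_46)));
        for (int i = 0; i < 3; i++) {
            Document doc = new Document();
            doc.add(new StringField("id", Integer.toString(i), Field.Store.YES));
            writer.addDocument(doc);
        }
        writer.commit();
        writer.deleteDocuments(new Term("id", "1")); // mark one document as deleted
        writer.close();

        IndexReader reader = DirectoryReader.open(dir);
        System.out.println("maxDoc  = " + reader.maxDoc());  // 3: includes the deleted slot (no merge has run)
        System.out.println("numDocs = " + reader.numDocs()); // 2: live documents only
        reader.close();
        dir.close();
    }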
From source file: com.tamingtext.classifier.bayes.ExtractTrainingData.java
License: Apache License
/**
 * Extract training data from a lucene index.
 * <p>
 * Iterates over documents in the lucene index; the values in the categoryFields are inspected and if found to
 * contain any of the strings found in the category file, a training data item will be emitted, assigned to the
 * matching category and containing the terms found in the fields listed in textFields. Output is written to
 * the output directory with one file per category.
 * <p>
 * The category file contains one line per category; each line contains a number of whitespace delimited strings.
 * The first string on each line is the category name, while subsequent strings will be used to identify documents
 * that belong in that category.
 * <p>
 * 'Technology Computers Macintosh' will cause documents that contain either 'Technology', 'Computers' or 'Macintosh'
 * in one of their categoryFields to be assigned to the 'Technology' category.
 *
 * @param indexDir
 *          directory of lucene index to extract from
 *
 * @param maxDocs
 *          the maximum number of documents to process.
 *
 * @param categoryFile
 *          file containing category strings to extract
 *
 * @param categoryFields
 *          list of fields to match against category data
 *
 * @param textFields
 *          list of fields containing terms to extract
 *
 * @param outputDir
 *          directory to write output to
 *
 * @throws IOException
 */
public static void extractTraininingData(File indexDir, File categoryFile, Collection<String> categoryFields,
        Collection<String> textFields, File outputDir, boolean useTermVectors) throws IOException {

    log.info("Index dir: " + indexDir);
    log.info("Category file: " + categoryFile);
    log.info("Output dir: " + outputDir);
    log.info("Category fields: " + categoryFields.toString());
    log.info("Text fields: " + textFields.toString());
    log.info("Use Term Vectors?: " + useTermVectors);

    OpenObjectIntHashMap<String> categoryCounts = new OpenObjectIntHashMap<String>();
    Map<String, List<String>> categories = readCategoryFile(categoryFile);

    Directory dir = FSDirectory.open(indexDir);
    IndexReader reader = IndexReader.open(dir, true);
    int max = reader.maxDoc();

    StringBuilder buf = new StringBuilder();
    for (int i = 0; i < max; i++) {
        if (!reader.isDeleted(i)) {
            Document d = reader.document(i);
            String category = null;

            // determine whether any of the fields in this document contain a
            // category in the category list
            fields: for (String field : categoryFields) {
                for (Field f : d.getFields(field)) {
                    if (f.isStored() && !f.isBinary()) {
                        String fieldValue = f.stringValue().toLowerCase();
                        for (String cat : categories.keySet()) {
                            List<String> cats = categories.get(cat);
                            for (String c : cats) {
                                if (fieldValue.contains(c)) {
                                    category = cat;
                                    break fields;
                                }
                            }
                        }
                    }
                }
            }

            if (category == null)
                continue;

            // append the terms from each of the textFields to the training data for this document.
            buf.setLength(0);
            for (String field : textFields) {
                if (useTermVectors) {
                    appendVectorTerms(buf, reader.getTermFreqVector(i, field));
                } else {
                    appendFieldText(buf, d.getField(field));
                }
            }

            getWriterForCategory(outputDir, category).printf("%s\t%s\n", category, buf.toString());
            categoryCounts.adjustOrPutValue(category, 1, 1);
        }
    }

    if (log.isInfoEnabled()) {
        StringBuilder b = new StringBuilder();
        b.append("\nCatagory document counts:\n");
        LinkedList<String> keyList = new LinkedList<String>();
        categoryCounts.keysSortedByValue(keyList);
        String key;
        while (!keyList.isEmpty()) {
            key = keyList.removeLast();
            b.append(categoryCounts.get(key)).append('\t').append(key).append('\n');
        }
        log.info(b.toString());
    }
}