List of usage examples for org.apache.lucene.index.IndexReader.maxDoc()
public abstract int maxDoc();
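maxDoc() returns one greater than the largest document number used in the reader's view of the index, so deleted-but-unmerged document slots are still counted; it is therefore the usual exclusive upper bound when looping over every document id, as the examples below do. A minimal iteration sketch (assuming Lucene 3.x-era APIs where IndexReader.open and isDeleted are still available; 'directory' is a hypothetical, already-populated Directory):

    // Minimal sketch: maxDoc() is an exclusive upper bound over all document
    // slots, including deleted ones, so deleted slots must be skipped explicitly.
    IndexReader reader = IndexReader.open(directory, true); // read-only reader
    try {
        for (int docId = 0; docId < reader.maxDoc(); docId++) {
            if (reader.isDeleted(docId)) {
                continue; // slot belongs to a deleted document
            }
            Document doc = reader.document(docId);
            // ... process the stored fields of doc ...
        }
    } finally {
        reader.close();
    }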
From source file: com.mmiagency.knime.nodes.keyworddensity.util.KeywordDensityHelper.java
License: Open Source License
public void execute() throws IOException {
    org.jsoup.nodes.Document jdoc = null;

    // pull content using Jsoup
    if (m_content != null && !m_content.trim().isEmpty()) {
        jdoc = Jsoup.parse(m_content);
    } else {
        Connection conn = Jsoup.connect(m_url);
        conn.validateTLSCertificates(false);
        conn.followRedirects(true);
        conn.userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:40.0) Gecko/20100101 Firefox/40.0");
        conn.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        conn.header("Accept-Language", "en-US,en;q=0.5");
        conn.header("Accept-Encoding", "gzip, deflate");
        conn.execute();
        jdoc = conn.get();
    }

    StringWriter text = new StringWriter();

    if (m_includeMetaKeywords) {
        text.write(jdoc.select("meta[name=keywords]").attr("content"));
        text.write(" ");
    }
    if (m_includeMetaDescription) {
        text.write(jdoc.select("meta[name=description]").attr("content"));
        text.write(" ");
    }
    if (m_includePageTitle) {
        text.write(jdoc.select("title").text());
        text.write(" ");
    }
    text.write(jdoc.select("body").text());

    // analyze content with Lucene
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    Directory directory = new RAMDirectory();
    IndexWriter indexWriter = new IndexWriter(directory, analyzer, MaxFieldLength.LIMITED);

    Document doc = new Document();
    Field textField = new Field("content", text.toString(), Field.Store.YES, Field.Index.ANALYZED,
            TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(textField);
    indexWriter.addDocument(doc);
    indexWriter.commit();
    indexWriter.close();

    IndexReader indexReader = IndexReader.open(directory, true);
    TermFreqVector termFreqVector = null;

    for (int i = 0; i < indexReader.maxDoc(); i++) {
        termFreqVector = indexReader.getTermFreqVector(i, "content");
        String[] terms = termFreqVector.getTerms();
        int[] freqs = termFreqVector.getTermFrequencies();
        for (int n = 0; n < termFreqVector.size(); n++) {
            if (m_excludeList.contains(terms[n])) {
                continue;
            }
            add(terms[n], freqs[n]);
        }
    }

    indexReader.close();
    directory.close();

    // sort map by value
    sortMap();
}
From source file: com.nearinfinity.blur.manager.IndexManager.java
License: Apache License
public static void fetchRow(IndexReader reader, String table, Selector selector, FetchResult fetchResult)
        throws CorruptIndexException, IOException {
    fetchResult.table = table;
    String locationId = selector.locationId;
    int lastSlash = locationId.lastIndexOf('/');
    int docId = Integer.parseInt(locationId.substring(lastSlash + 1));
    if (docId >= reader.maxDoc()) {
        throw new RuntimeException("Location id [" + locationId + "] with docId [" + docId + "] is not valid.");
    }
    if (selector.isRecordOnly()) {
        // select only the row for the given data or location id.
        if (reader.isDeleted(docId)) {
            fetchResult.exists = false;
            fetchResult.deleted = true;
            return;
        } else {
            fetchResult.exists = true;
            fetchResult.deleted = false;
            Document document = reader.document(docId, getFieldSelector(selector));
            fetchResult.recordResult = getColumns(document);
            return;
        }
    } else {
        if (reader.isDeleted(docId)) {
            fetchResult.exists = false;
            fetchResult.deleted = true;
            return;
        } else {
            fetchResult.exists = true;
            fetchResult.deleted = false;
            String rowId = getRowId(reader, docId);
            TermDocs termDocs = reader.termDocs(new Term(ROW_ID, rowId));
            fetchResult.rowResult = new FetchRowResult(
                    getRow(new TermDocIterable(termDocs, reader, getFieldSelector(selector))));
            return;
        }
    }
}
From source file: com.nearinfinity.blur.utils.PrimeDocCache.java
License: Apache License
/**
 * The way this method is called via warm up methods, the likelihood of
 * creating multiple bitsets during a race condition is very low; that's why
 * this method is not synced.
 */
public static OpenBitSet getPrimeDocBitSet(IndexReader reader) throws IOException {
    Object key = reader.getCoreCacheKey();
    OpenBitSet bitSet = primeDocMap.get(key);
    if (bitSet == null) {
        reader.addReaderClosedListener(new ReaderClosedListener() {
            @Override
            public void onClose(IndexReader reader) {
                Object key = reader.getCoreCacheKey();
                LOG.debug("Current size [" + primeDocMap.size() + "] Prime Doc BitSet removing for segment ["
                        + reader + "]");
                primeDocMap.remove(key);
            }
        });
        LOG.debug("Prime Doc BitSet missing for segment [" + reader + "] current size [" + primeDocMap.size()
                + "]");
        bitSet = new OpenBitSet(reader.maxDoc());
        primeDocMap.put(key, bitSet);
        TermDocs termDocs = reader.termDocs(BlurConstants.PRIME_DOC_TERM);
        while (termDocs.next()) {
            bitSet.set(termDocs.doc());
        }
        termDocs.close();
    }
    return bitSet;
}
From source file: com.o19s.solr.swan.highlight.SpanAwareFieldTermStack.java
License: Apache License
/**
 * a constructor.
 *
 * @param reader IndexReader of the index
 * @param docId document id to be highlighted
 * @param fieldName field of the document to be highlighted
 * @param fieldQuery FieldQuery object
 * @throws IOException If there is a low-level I/O error
 */
public SpanAwareFieldTermStack(IndexReader reader, int docId, String fieldName,
        final SpanAwareFieldQuery fieldQuery) throws IOException {
    this.fieldName = fieldName;

    Set<String> termSet = fieldQuery.getTermSet(fieldName);
    Set<String> alwaysHighlightTermSet = fieldQuery.getHighlightTermSet(fieldName);

    // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
    if (termSet == null)
        return;

    final Fields vectors = reader.getTermVectors(docId);
    if (vectors == null) {
        // null snippet
        return;
    }

    final Terms vector = vectors.terms(fieldName);
    if (vector == null) {
        // null snippet
        return;
    }

    final CharsRef spare = new CharsRef();
    final TermsEnum termsEnum = vector.iterator(null);
    DocsAndPositionsEnum dpEnum = null;
    BytesRef text;

    int numDocs = reader.maxDoc();

    while ((text = termsEnum.next()) != null) {
        UnicodeUtil.UTF8toUTF16(text, spare);
        final String term = spare.toString();
        if (!termSet.contains(term)) {
            continue;
        }
        dpEnum = termsEnum.docsAndPositions(null, dpEnum);
        if (dpEnum == null) {
            // null snippet
            return;
        }

        dpEnum.nextDoc();

        // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
        final float weight = (float) (Math
                .log(numDocs / (double) (reader.docFreq(new Term(fieldName, text)) + 1)) + 1.0);

        final int freq = dpEnum.freq();

        for (int i = 0; i < freq; i++) {
            int pos = dpEnum.nextPosition();
            if (dpEnum.startOffset() < 0) {
                return; // no offsets, null snippet
            }
            if (alwaysHighlightTermSet.contains(term)
                    || fieldQuery.doesDocFieldContainPosition(fieldName, docId, dpEnum.startOffset())) {
                termList.add(new TermInfo(term, dpEnum.startOffset(), dpEnum.endOffset(), pos, weight));
            }
        }
    }

    // sort by position
    Collections.sort(termList);
}
From source file: com.pjaol.lucene.search.SerialChainFilter.java
License: Apache License
@Override
// public BitSet bits(IndexReader reader) throws CorruptIndexException, IOException {
public BitSet bits(IndexReader reader) throws IOException {

    if (chain.length == 0) {
        BitSet bits = new BitSet(reader.maxDoc());
        for (int i = 0; i < bits.size(); i++)
            bits.set(i);
        return bits;
    }

    BitSet bits = new BitSet(reader.maxDoc());

    int chainSize = chain.length;
    int actionSize = actionType.length;

    int i = 0;
    /**
     * taken from ChainedFilter, first and on an empty bitset results in 0
     */
    if (actionType[i] == AND) {
        try {
            bits = (BitSet) chain[i].bits(reader).clone();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        ++i;
    }

    for (; i < chainSize; i++) {

        int action = (i < actionSize) ? actionType[i] : DEFAULT;

        switch (action) {
        case (SERIALAND):
            try {
                bits.and(((ISerialChainFilter) chain[i]).bits(reader, bits));
                // } catch (CorruptIndexException e) {
                //     // TODO Auto-generated catch block
                //     e.printStackTrace();
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            break;
        case (SERIALOR):
            try {
                bits.or(((ISerialChainFilter) chain[i]).bits(reader, bits));
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            break;
        case (AND):
            bits.and(chain[i].bits(reader));
            break;
        case (OR):
            bits.or(chain[i].bits(reader));
            break;
        }
    }
    System.out.println("Filter: " + this.toString() + " returns:" + bits.cardinality() + " docs");
    return bits;
}
From source file: com.redhat.satellite.search.index.IndexManager.java
License: Open Source License
/**
 * Removes any documents which are not related to the passed in Set of good values
 * @param ids Set of ids of all known/good values
 * @param indexName index name to operate on
 * @param uniqField the name of the field in the Document to uniquely identify
 * this record
 * @return the number of documents deleted
 */
public int deleteRecordsNotInList(Set<String> ids, String indexName, String uniqField) {
    int count = 0;
    IndexReader reader = null;
    try {
        reader = getIndexReader(indexName, IndexHandler.DEFAULT_LANG);

        // Use maxDoc() to iterate over all docs, numDocs() returns the
        // number of currently alive docs leaving out the deleted ones.
        int maxDoc = reader.maxDoc();
        for (int i = 0; i < maxDoc; i++) {
            if (!reader.isDeleted(i)) {
                Document doc = reader.document(i);
                String uniqId = doc.getField(uniqField).stringValue();
                if (!ids.contains(uniqId)) {
                    log.info(indexName + ":" + uniqField + ": <" + uniqId
                            + "> not found in list of current/good values "
                            + "assuming this has been deleted from Database and we "
                            + "should remove it.");
                    removeFromIndex(indexName, uniqField, uniqId);
                    count++;
                }
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
        log.info("deleteRecordsNotInList() caught exception : " + e);
    } catch (IndexingException e) {
        e.printStackTrace();
        log.info("deleteRecordsNotInList() caught exception : " + e);
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                //
            }
        }
    }
    return count;
}
From source file: com.searchbox.SuggeterDataStructureBuilder.java
License: Apache License
private void iterateThroughDocuments(SolrIndexSearcher searcher, String[] fields, int maxNumDocs) {
    IndexReader reader = searcher.getIndexReader();

    // WARNING: returns null if there are no deletions
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    maxNumDocs = Math.min(maxNumDocs, reader.maxDoc());

    if (maxNumDocs == -1) {
        maxNumDocs = reader.maxDoc();
    }
    LOGGER.info("Analyzing docs:\t" + numdocs);

    for (int docID = 0; docID < reader.maxDoc(); docID++) {
        if (numdocs > maxNumDocs) {
            break;
        }
        if (liveDocs != null && !liveDocs.get(docID)) {
            continue; // deleted
        }

        if ((docID % 1000) == 0) {
            LOGGER.debug("Doing " + docID + " of " + maxNumDocs);
        }

        StringBuilder text = new StringBuilder();
        for (String field : fields) {
            /*
             * not sure if this is the best way, might make sense to do a
             * process text for each field individually, but then book
             * keeping the doc freq for terms becomes a bit of a pain in the
             * ass
             */
            try {
                IndexableField[] multifield = reader.document(docID).getFields(field);
                for (IndexableField singlefield : multifield) {
                    // create one big string from all of the text in the
                    // documents for processing later on
                    text.append(". " + singlefield.stringValue());
                }
            } catch (IOException ex) {
                LOGGER.warn("Document " + docID + " missing requested field (" + field + ")...ignoring");
            }
        }

        // might as well see if its empty
        if (text.length() > 0) {
            // actually processes the massive string which was created from
            // all of the above fields
            processText(text.toString().toLowerCase());
            numdocs++;
        }
    }

    LOGGER.info("Number of documents analyzed: \t" + numdocs);
    for (int zz = 0; zz < counts.length; zz++) {
        LOGGER.info("Number of " + zz + "-grams: \t" + counts[zz]);
    }
}
From source file: com.searchbox.Tagger.java
License: Apache License
private void DfCountBuilder(SolrIndexSearcher searcher, String[] fields, int maxNumDocs) {
    IndexReader reader = searcher.getIndexReader();
    Bits liveDocs = MultiFields.getLiveDocs(reader); // WARNING: returns null if
                                                     // there are no deletions
    maxNumDocs = Math.min(maxNumDocs, reader.maxDoc());

    if (maxNumDocs == -1) {
        maxNumDocs = reader.maxDoc();
    }
    LOGGER.info("Analyzing docs:\t" + numdocs);

    for (int docID = 0; docID < reader.maxDoc(); docID++) {
        if (numdocs > maxNumDocs) {
            break;
        }
        if (liveDocs != null && !liveDocs.get(docID)) {
            continue; // deleted
        }

        if ((docID % 1000) == 0) {
            LOGGER.debug("Doing " + docID + " of " + maxNumDocs);
        }

        StringBuilder text = new StringBuilder();
        for (String field : fields) {
            // not sure if this is the best way, might make sense to do a
            // process text for each field individually, but then book keeping
            // the doc freq for terms becomes a bit of a pain in the ass
            try {
                text.append(". " + reader.document(docID).get(field));
            } catch (IOException ex) {
                LOGGER.warn("Document " + docID + " missing requested field (" + field + ")...ignoring");
            }
        }

        if (text.length() > 0) { // might as well see if its empty
            processDocText(text.toString());
            numdocs++;
        }
    }

    LOGGER.info("Number of documents analyzed: \t" + numdocs);
    dfcounts.put(DOC_COUNTS_STRING, numdocs);
    tfcounts.put(DOC_COUNTS_STRING, numdocs);
}
From source file: com.sxc.lucene.index.IndexingTest.java
License: Apache License
public void testIndexReader() throws IOException {
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(ids.length, reader.maxDoc()); // 8
    assertEquals(ids.length, reader.numDocs()); // 8
    reader.close();
}
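The two assertions above agree only because no documents have been deleted from the test index; maxDoc() and numDocs() diverge once documents are deleted but their slots have not yet been merged away. A small sketch of that difference (assuming Lucene 4.x-era APIs such as DirectoryReader, StringField, and Version.LUCENE_46; the method name and field names here are illustrative, not taken from the test above):

    // Minimal sketch (assumed Lucene 4.x APIs): after a delete, maxDoc() still
    // counts the deleted slot until a merge removes it, while numDocs() reflects
    // live documents only.
    public void showMaxDocVersusNumDocs() throws IOException {
        Directory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir,
                new IndexWriterConfig(Version.LUCENE_46, new StandardAnalyzer(Version.LUCENE_46)));
        for (int i = 0; i < 3; i++) {
            Document doc = new Document();
            doc.add(new StringField("id", Integer.toString(i), Field.Store.YES));
            writer.addDocument(doc);
        }
        writer.commit();
        writer.deleteDocuments(new Term("id", "1")); // mark one document as deleted
        writer.close();

        IndexReader reader = DirectoryReader.open(dir);
        System.out.println("maxDoc  = " + reader.maxDoc());  // 3: includes the deleted slot (no merge has run)
        System.out.println("numDocs = " + reader.numDocs()); // 2: live documents only
        reader.close();
        dir.close();
    }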
From source file: com.tamingtext.classifier.bayes.ExtractTrainingData.java
License: Apache License
/**
 * Extract training data from a lucene index.
 * <p>
 * Iterates over documents in the lucene index; the values in the categoryFields are inspected and if found to
 * contain any of the strings found in the category file, a training data item will be emitted, assigned to the
 * matching category and containing the terms found in the fields listed in textFields. Output is written to
 * the output directory with one file per category.
 * <p>
 * The category file contains one line per category; each line contains a number of whitespace delimited strings.
 * The first string on each line is the category name, while subsequent strings will be used to identify documents
 * that belong in that category.
 * <p>
 * 'Technology Computers Macintosh' will cause documents that contain either 'Technology', 'Computers' or 'Macintosh'
 * in one of their categoryFields to be assigned to the 'Technology' category.
 *
 * @param indexDir
 *          directory of lucene index to extract from
 *
 * @param maxDocs
 *          the maximum number of documents to process.
 *
 * @param categoryFile
 *          file containing category strings to extract
 *
 * @param categoryFields
 *          list of fields to match against category data
 *
 * @param textFields
 *          list of fields containing terms to extract
 *
 * @param outputDir
 *          directory to write output to
 *
 * @throws IOException
 */
public static void extractTraininingData(File indexDir, File categoryFile, Collection<String> categoryFields,
        Collection<String> textFields, File outputDir, boolean useTermVectors) throws IOException {

    log.info("Index dir: " + indexDir);
    log.info("Category file: " + categoryFile);
    log.info("Output dir: " + outputDir);
    log.info("Category fields: " + categoryFields.toString());
    log.info("Text fields: " + textFields.toString());
    log.info("Use Term Vectors?: " + useTermVectors);

    OpenObjectIntHashMap<String> categoryCounts = new OpenObjectIntHashMap<String>();
    Map<String, List<String>> categories = readCategoryFile(categoryFile);

    Directory dir = FSDirectory.open(indexDir);
    IndexReader reader = IndexReader.open(dir, true);
    int max = reader.maxDoc();

    StringBuilder buf = new StringBuilder();
    for (int i = 0; i < max; i++) {
        if (!reader.isDeleted(i)) {
            Document d = reader.document(i);
            String category = null;

            // determine whether any of the fields in this document contain a
            // category in the category list
            fields: for (String field : categoryFields) {
                for (Field f : d.getFields(field)) {
                    if (f.isStored() && !f.isBinary()) {
                        String fieldValue = f.stringValue().toLowerCase();
                        for (String cat : categories.keySet()) {
                            List<String> cats = categories.get(cat);
                            for (String c : cats) {
                                if (fieldValue.contains(c)) {
                                    category = cat;
                                    break fields;
                                }
                            }
                        }
                    }
                }
            }

            if (category == null)
                continue;

            // append the terms from each of the textFields to the training data for this document.
            buf.setLength(0);
            for (String field : textFields) {
                if (useTermVectors) {
                    appendVectorTerms(buf, reader.getTermFreqVector(i, field));
                } else {
                    appendFieldText(buf, d.getField(field));
                }
            }

            getWriterForCategory(outputDir, category).printf("%s\t%s\n", category, buf.toString());
            categoryCounts.adjustOrPutValue(category, 1, 1);
        }
    }

    if (log.isInfoEnabled()) {
        StringBuilder b = new StringBuilder();
        b.append("\nCatagory document counts:\n");
        LinkedList<String> keyList = new LinkedList<String>();
        categoryCounts.keysSortedByValue(keyList);
        String key;
        while (!keyList.isEmpty()) {
            key = keyList.removeLast();
            b.append(categoryCounts.get(key)).append('\t').append(key).append('\n');
        }
        log.info(b.toString());
    }
}