Example usage for org.apache.lucene.index IndexReader maxDoc

Introduction

On this page you can find example usage for org.apache.lucene.index.IndexReader.maxDoc().

Prototype

public abstract int maxDoc();

Document

Returns one greater than the largest possible document number.
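
Before the project examples, here is a minimal sketch of the typical maxDoc() idiom, assuming a Lucene 4.x index in a Directory variable named directory and a stored field named "id" (both placeholders); DirectoryReader, MultiFields, and Bits are from the Lucene core packages. maxDoc() is an exclusive upper bound on document numbers, so it serves as the loop limit, while deleted documents below that bound still occupy slots and must be skipped explicitly.

// Minimal sketch; 'directory' and the "id" field are assumptions, not part of any example below.
IndexReader reader = DirectoryReader.open(directory);
Bits liveDocs = MultiFields.getLiveDocs(reader); // null when the index has no deletions
for (int docId = 0; docId < reader.maxDoc(); docId++) {
    if (liveDocs != null && !liveDocs.get(docId)) {
        continue; // slot below maxDoc() belongs to a deleted document
    }
    Document doc = reader.document(docId);
    System.out.println(doc.get("id"));
}
reader.close();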

Usage

From source file:com.mmiagency.knime.nodes.keyworddensity.util.KeywordDensityHelper.java

License:Open Source License

public void execute() throws IOException {

    org.jsoup.nodes.Document jdoc = null;

    // pull content using Jsoup 
    if (m_content != null && !m_content.trim().isEmpty()) {
        jdoc = Jsoup.parse(m_content);
    } else {
        Connection conn = Jsoup.connect(m_url);

        conn.validateTLSCertificates(false);
        conn.followRedirects(true);
        conn.userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:40.0) Gecko/20100101 Firefox/40.0");
        conn.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        conn.header("Accept-Language", "en-US,en;q=0.5");
        conn.header("Accept-Encoding", "gzip, deflate");

        conn.execute();
        jdoc = conn.get();
    }

    StringWriter text = new StringWriter();

    if (m_includeMetaKeywords) {
        text.write(jdoc.select("meta[name=keywords]").attr("content"));
        text.write(" ");
    }
    if (m_includeMetaDescription) {
        text.write(jdoc.select("meta[name=description]").attr("content"));
        text.write(" ");
    }
    if (m_includePageTitle) {
        text.write(jdoc.select("title").text());
        text.write(" ");
    }

    text.write(jdoc.select("body").text());

    // analyze content with Lucene
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    Directory directory = new RAMDirectory();
    IndexWriter indexWriter = new IndexWriter(directory, analyzer, MaxFieldLength.LIMITED);

    Document doc = new Document();
    Field textField = new Field("content", text.toString(), Field.Store.YES, Field.Index.ANALYZED,
            TermVector.WITH_POSITIONS_OFFSETS);

    doc.add(textField);

    indexWriter.addDocument(doc);
    indexWriter.commit();
    indexWriter.close();

    IndexReader indexReader = IndexReader.open(directory, true);

    TermFreqVector termFreqVector = null;

    for (int i = 0; i < indexReader.maxDoc(); i++) {
        termFreqVector = indexReader.getTermFreqVector(i, "content");

        String[] terms = termFreqVector.getTerms();
        int[] freqs = termFreqVector.getTermFrequencies();

        for (int n = 0; n < termFreqVector.size(); n++) {
            if (m_excludeList.contains(terms[n])) {
                continue;
            }
            add(terms[n], freqs[n]);
        }
    }

    indexReader.close();
    directory.close();

    // sort map by value
    sortMap();
}

From source file:com.nearinfinity.blur.manager.IndexManager.java

License:Apache License

public static void fetchRow(IndexReader reader, String table, Selector selector, FetchResult fetchResult)
        throws CorruptIndexException, IOException {
    fetchResult.table = table;
    String locationId = selector.locationId;
    int lastSlash = locationId.lastIndexOf('/');
    int docId = Integer.parseInt(locationId.substring(lastSlash + 1));
    if (docId >= reader.maxDoc()) {
        throw new RuntimeException("Location id [" + locationId + "] with docId [" + docId + "] is not valid.");
    }
    if (selector.isRecordOnly()) {
        // select only the row for the given data or location id.
        if (reader.isDeleted(docId)) {
            fetchResult.exists = false;
            fetchResult.deleted = true;
            return;
        } else {
            fetchResult.exists = true;
            fetchResult.deleted = false;
            Document document = reader.document(docId, getFieldSelector(selector));
            fetchResult.recordResult = getColumns(document);
            return;
        }
    } else {
        if (reader.isDeleted(docId)) {
            fetchResult.exists = false;
            fetchResult.deleted = true;
            return;
        } else {
            fetchResult.exists = true;
            fetchResult.deleted = false;
            String rowId = getRowId(reader, docId);
            TermDocs termDocs = reader.termDocs(new Term(ROW_ID, rowId));
            fetchResult.rowResult = new FetchRowResult(
                    getRow(new TermDocIterable(termDocs, reader, getFieldSelector(selector))));
            return;
        }
    }
}

From source file:com.nearinfinity.blur.utils.PrimeDocCache.java

License:Apache License

/**
 * Because of the way this method is called via warm-up methods, the likelihood of
 * creating multiple bitsets during a race condition is very low; that is why
 * this method is not synchronized.
 */
public static OpenBitSet getPrimeDocBitSet(IndexReader reader) throws IOException {
    Object key = reader.getCoreCacheKey();
    OpenBitSet bitSet = primeDocMap.get(key);
    if (bitSet == null) {
        reader.addReaderClosedListener(new ReaderClosedListener() {
            @Override
            public void onClose(IndexReader reader) {
                Object key = reader.getCoreCacheKey();
                LOG.debug("Current size [" + primeDocMap.size() + "] Prime Doc BitSet removing for segment ["
                        + reader + "]");
                primeDocMap.remove(key);
            }
        });
        LOG.debug("Prime Doc BitSet missing for segment [" + reader + "] current size [" + primeDocMap.size()
                + "]");
        bitSet = new OpenBitSet(reader.maxDoc());
        primeDocMap.put(key, bitSet);
        TermDocs termDocs = reader.termDocs(BlurConstants.PRIME_DOC_TERM);
        while (termDocs.next()) {
            bitSet.set(termDocs.doc());
        }
        termDocs.close();
    }
    return bitSet;
}
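
A hypothetical usage sketch for the cache above; segmentReader and docId are assumptions supplied by the caller, and the only contract relied on is the public signature shown, which sizes the returned bitset to reader.maxDoc() so any valid document number can be tested.

// Hypothetical caller; 'segmentReader' and 'docId' are assumptions, not part of the Blur source shown here.
OpenBitSet primeDocs = PrimeDocCache.getPrimeDocBitSet(segmentReader);
if (docId < segmentReader.maxDoc() && primeDocs.get(docId)) {
    // docId is flagged as a prime doc (the first document of a row) in this segment
}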

From source file:com.o19s.solr.swan.highlight.SpanAwareFieldTermStack.java

License:Apache License

/**
 * a constructor.
 * 
 * @param reader IndexReader of the index
 * @param docId document id to be highlighted
 * @param fieldName field of the document to be highlighted
 * @param fieldQuery FieldQuery object
 * @throws IOException If there is a low-level I/O error
 */
public SpanAwareFieldTermStack(IndexReader reader, int docId, String fieldName,
        final SpanAwareFieldQuery fieldQuery) throws IOException {
    this.fieldName = fieldName;

    Set<String> termSet = fieldQuery.getTermSet(fieldName);
    Set<String> alwaysHighlightTermSet = fieldQuery.getHighlightTermSet(fieldName);

    // just return to produce a null snippet if an unmatched fieldName is specified when fieldMatch == true
    if (termSet == null)
        return;

    final Fields vectors = reader.getTermVectors(docId);
    if (vectors == null) {
        // null snippet
        return;
    }

    final Terms vector = vectors.terms(fieldName);
    if (vector == null) {
        // null snippet
        return;
    }

    final CharsRef spare = new CharsRef();
    final TermsEnum termsEnum = vector.iterator(null);
    DocsAndPositionsEnum dpEnum = null;
    BytesRef text;

    int numDocs = reader.maxDoc();
    while ((text = termsEnum.next()) != null) {
        UnicodeUtil.UTF8toUTF16(text, spare);
        final String term = spare.toString();
        if (!termSet.contains(term)) {
            continue;
        }
        dpEnum = termsEnum.docsAndPositions(null, dpEnum);
        if (dpEnum == null) {
            // null snippet
            return;
        }

        dpEnum.nextDoc();

        // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
        final float weight = (float) (Math
                .log(numDocs / (double) (reader.docFreq(new Term(fieldName, text)) + 1)) + 1.0);

        final int freq = dpEnum.freq();

        for (int i = 0; i < freq; i++) {
            int pos = dpEnum.nextPosition();
            if (dpEnum.startOffset() < 0) {
                return; // no offsets, null snippet
            }

            if (alwaysHighlightTermSet.contains(term)
                    || fieldQuery.doesDocFieldContainPosition(fieldName, docId, dpEnum.startOffset())) {
                termList.add(new TermInfo(term, dpEnum.startOffset(), dpEnum.endOffset(), pos, weight));
            }
        }

    }

    // sort by position
    Collections.sort(termList);
}

From source file:com.pjaol.lucene.search.SerialChainFilter.java

License:Apache License

@Override
//   public BitSet bits(IndexReader reader) throws CorruptIndexException, IOException {
public BitSet bits(IndexReader reader) throws IOException {

    if (chain.length == 0) {
        BitSet bits = new BitSet(reader.maxDoc());
        for (int i = 0; i < bits.size(); i++)
            bits.set(i);
        return bits;
    }
    BitSet bits = new BitSet(reader.maxDoc());
    int chainSize = chain.length;
    int actionSize = actionType.length;
    int i = 0;

    /**
     * taken from ChainedFilter: an AND against the initially empty bitset would always
     * yield an empty result, so seed the bitset from the first filter instead
     */
    if (actionType[i] == AND) {
        try {
            bits = (BitSet) chain[i].bits(reader).clone();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        ++i;
    }

    for (; i < chainSize; i++) {

        int action = (i < actionSize) ? actionType[i] : DEFAULT;

        switch (action) {

        case (SERIALAND):
            try {
                bits.and(((ISerialChainFilter) chain[i]).bits(reader, bits));
                //               } catch (CorruptIndexException e) {
                //                  // TODO Auto-generated catch block
                //                  e.printStackTrace();
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            break;
        case (SERIALOR):
            try {
                bits.or(((ISerialChainFilter) chain[i]).bits(reader, bits));
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            break;
        case (AND):
            bits.and(chain[i].bits(reader));
            break;
        case (OR):
            bits.or(chain[i].bits(reader));
            break;

        }

    }
    System.out.println("Filter: " + this.toString() + " returns:" + bits.cardinality() + " docs");
    return bits;
}

From source file:com.redhat.satellite.search.index.IndexManager.java

License:Open Source License

/**
 * Removes any documents which are not related to the passed-in Set of good values
 * @param ids Set of ids of all known/good values
 * @param indexName index name to operate on
 * @param uniqField the name of the field in the Document that uniquely identifies
 * this record
 * @return the number of documents deleted
 */
public int deleteRecordsNotInList(Set<String> ids, String indexName, String uniqField) {
    int count = 0;
    IndexReader reader = null;
    try {
        reader = getIndexReader(indexName, IndexHandler.DEFAULT_LANG);

        // Use maxDoc() to iterate over all docs; numDocs() returns the
        // number of currently live docs, leaving out the deleted ones.
        int maxDoc = reader.maxDoc();
        for (int i = 0; i < maxDoc; i++) {
            if (!reader.isDeleted(i)) {
                Document doc = reader.document(i);
                String uniqId = doc.getField(uniqField).stringValue();
                if (!ids.contains(uniqId)) {
                    log.info(indexName + ":" + uniqField + ":  <" + uniqId
                            + "> not found in list of current/good values "
                            + "assuming this has been deleted from Database and we " + "should remove it.");
                    removeFromIndex(indexName, uniqField, uniqId);
                    count++;
                }
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
        log.info("deleteRecordsNotInList() caught exception : " + e);
    } catch (IndexingException e) {
        e.printStackTrace();
        log.info("deleteRecordsNotInList() caught exception : " + e);
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                //
            }
        }
    }
    return count;
}
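
A hedged usage sketch for the method above; the indexManager instance, the index name "server", the unique field "id", and the id set are illustrative assumptions, not taken from the Satellite source.

// Hypothetical caller; indexManager, the index name, and the field name are assumptions.
Set<String> goodIds = new HashSet<String>(Arrays.asList("1001", "1002", "1003"));
int removed = indexManager.deleteRecordsNotInList(goodIds, "server", "id");
log.info("Removed " + removed + " stale documents no longer present in the database");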

From source file:com.searchbox.SuggeterDataStructureBuilder.java

License:Apache License

private void iterateThroughDocuments(SolrIndexSearcher searcher, String[] fields, int maxNumDocs) {
    IndexReader reader = searcher.getIndexReader();
    // WARNING: returns null if there are no deletions
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    maxNumDocs = Math.min(maxNumDocs, reader.maxDoc());

    if (maxNumDocs == -1) {
        maxNumDocs = reader.maxDoc();
    }
    LOGGER.info("Analyzing docs:\t" + numdocs);

    for (int docID = 0; docID < reader.maxDoc(); docID++) {
        if (numdocs > maxNumDocs) {
            break;
        }
        if (liveDocs != null && !liveDocs.get(docID)) {
            continue; // deleted
        }

        if ((docID % 1000) == 0) {
            LOGGER.debug("Doing " + docID + " of " + maxNumDocs);
        }

        StringBuilder text = new StringBuilder();
        for (String field : fields) {
            /*
             * not sure if this is the best way, might make sense to do a
             * process text for each field individually, but then book
             * keeping the doc freq for terms becomes a bit of a pain in the
             * ass
             */
            try {
                IndexableField[] multifield = reader.document(docID).getFields(field);
                for (IndexableField singlefield : multifield) {
                    // create one big string from all of the text in the
                    // documents for processing later on
                    text.append(". " + singlefield.stringValue());
                }

            } catch (IOException ex) {
                LOGGER.warn("Document " + docID + " missing requested field (" + field + ")...ignoring");
            }
        }
        // might as well see if it's empty
        if (text.length() > 0) {
            // actually processes the massive string which was created from
            // all of the above fields
            processText(text.toString().toLowerCase());
            numdocs++;
        }
    }

    LOGGER.info("Number of documents analyzed: \t" + numdocs);
    for (int zz = 0; zz < counts.length; zz++) {
        LOGGER.info("Number of " + zz + "-grams: \t" + counts[zz]);
    }
}

From source file:com.searchbox.Tagger.java

License:Apache License

private void DfCountBuilder(SolrIndexSearcher searcher, String[] fields, int maxNumDocs) {
    IndexReader reader = searcher.getIndexReader();
    Bits liveDocs = MultiFields.getLiveDocs(reader); // WARNING: returns null if
                                                     // there are no deletions

    maxNumDocs = Math.min(maxNumDocs, reader.maxDoc());

    if (maxNumDocs == -1) {
        maxNumDocs = reader.maxDoc();
    }
    LOGGER.info("Analyzing docs:\t" + numdocs);

    for (int docID = 0; docID < reader.maxDoc(); docID++) {
        if (numdocs > maxNumDocs) {
            break;
        }
        if (liveDocs != null && !liveDocs.get(docID)) {
            continue; // deleted
        }

        if ((docID % 1000) == 0) {
            LOGGER.debug("Doing " + docID + " of " + maxNumDocs);
        }

        StringBuilder text = new StringBuilder();
        for (String field : fields) { // not sure if this is the best way, might
                                      // make sense to do a
                                      // process text for each field individually, but then book keeping
                                      // the doc freq for terms becomes a bit of a pain in the ass
            try {
                text.append(". " + reader.document(docID).get(field));
            } catch (IOException ex) {
                LOGGER.warn("Document " + docID + " missing requested field (" + field + ")...ignoring");
            }
        }
    if (text.length() > 0) { // might as well see if it's empty
            processDocText(text.toString());
            numdocs++;

        }
    }

    LOGGER.info("Number of documents analyzed: \t" + numdocs);
    dfcounts.put(DOC_COUNTS_STRING, numdocs);
    tfcounts.put(DOC_COUNTS_STRING, numdocs);
}

From source file:com.sxc.lucene.index.IndexingTest.java

License:Apache License

public void testIndexReader() throws IOException {
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(ids.length, reader.maxDoc()); // 8
    assertEquals(ids.length, reader.numDocs()); // 8
    reader.close();
}
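
The assertions above describe an index with no deletions, where maxDoc() and numDocs() agree. As a hedged follow-up sketch, assuming the test fixture also exposes its IndexWriter as writer and indexes each document with a stored "id" field: after a delete and commit, numDocs() drops while maxDoc() keeps counting the deleted slot until a merge reclaims it.

public void testMaxDocVersusNumDocsAfterDelete() throws IOException {
    // 'writer', 'ids' and the "id" field are assumptions about the surrounding test class
    writer.deleteDocuments(new Term("id", ids[0]));
    writer.commit();

    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(ids.length - 1, reader.numDocs()); // live documents only
    assertEquals(ids.length, reader.maxDoc()); // deleted slot still counted until segments merge
    reader.close();
}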

From source file:com.tamingtext.classifier.bayes.ExtractTrainingData.java

License:Apache License

/**
 * Extract training data from a Lucene index.
 * <p>
 * Iterates over documents in the Lucene index; the values in the categoryFields are inspected and, if found to
 * contain any of the strings found in the category file, a training data item will be emitted, assigned to the
 * matching category and containing the terms found in the fields listed in textFields. Output is written to
 * the output directory with one file per category.
 * <p>
 * The category file contains one line per category, each line contains a number of whitespace delimited strings. 
 * The first string on each line is the category name, while subsequent strings will be used to identify documents
 * that belong in that category.
 * <p>
 * 'Technology Computers Macintosh' will cause documents that contain either 'Technology', 'Computers' or 'Macintosh'
 * in one of their categoryFields to be assigned to the 'Technology' category.
 * 
 * 
 * @param indexDir 
 *   directory of lucene index to extract from
 *   
 * @param maxDocs
 *   the maximum number of documents to process.
 *   
 * @param categoryFile
 *   file containing category strings to extract
 *   
 * @param categoryFields
 *   list of fields to match against category data
 *   
 * @param textFields
 *   list of fields containing terms to extract
 *   
 * @param outputDir
 *   directory to write output to
 *   
 * @throws IOException
 */
public static void extractTraininingData(File indexDir, File categoryFile, Collection<String> categoryFields,
        Collection<String> textFields, File outputDir, boolean useTermVectors) throws IOException {

    log.info("Index dir: " + indexDir);
    log.info("Category file: " + categoryFile);
    log.info("Output dir: " + outputDir);
    log.info("Category fields: " + categoryFields.toString());
    log.info("Text fields: " + textFields.toString());
    log.info("Use Term Vectors?: " + useTermVectors);
    OpenObjectIntHashMap<String> categoryCounts = new OpenObjectIntHashMap<String>();
    Map<String, List<String>> categories = readCategoryFile(categoryFile);

    Directory dir = FSDirectory.open(indexDir);
    IndexReader reader = IndexReader.open(dir, true);
    int max = reader.maxDoc();

    StringBuilder buf = new StringBuilder();

    for (int i = 0; i < max; i++) {
        if (!reader.isDeleted(i)) {
            Document d = reader.document(i);
            String category = null;

            // determine whether any of the fields in this document contain a 
            // category in the category list
            fields: for (String field : categoryFields) {
                for (Field f : d.getFields(field)) {
                    if (f.isStored() && !f.isBinary()) {
                        String fieldValue = f.stringValue().toLowerCase();
                        for (String cat : categories.keySet()) {
                            List<String> cats = categories.get(cat);
                            for (String c : cats) {
                                if (fieldValue.contains(c)) {
                                    category = cat;
                                    break fields;
                                }
                            }
                        }
                    }
                }
            }

            if (category == null)
                continue;

            // append the terms from each of the textFields to the training data for this document.
            buf.setLength(0);
            for (String field : textFields) {
                if (useTermVectors) {
                    appendVectorTerms(buf, reader.getTermFreqVector(i, field));
                } else {
                    appendFieldText(buf, d.getField(field));
                }
            }
            getWriterForCategory(outputDir, category).printf("%s\t%s\n", category, buf.toString());
            categoryCounts.adjustOrPutValue(category, 1, 1);
        }
    }

    if (log.isInfoEnabled()) {
        StringBuilder b = new StringBuilder();
        b.append("\nCatagory document counts:\n");
        LinkedList<String> keyList = new LinkedList<String>();
        categoryCounts.keysSortedByValue(keyList);
        String key;
        while (!keyList.isEmpty()) {
            key = keyList.removeLast();
            b.append(categoryCounts.get(key)).append('\t').append(key).append('\n');
        }
        log.info(b.toString());
    }
}
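
For reference, a category file in the format described by the javadoc above might look like the following; the category names and match strings are purely illustrative. The first token on each line is the category name, and the remaining tokens are the strings matched against the categoryFields.

Technology computers macintosh software
Sports baseball basketball football
Science physics chemistry biology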