List of usage examples for org.apache.lucene.index.IndexReader.maxDoc()
public abstract int maxDoc();
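maxDoc() returns one greater than the largest possible document number in the index, so valid document IDs run from 0 to maxDoc()-1 and include slots left by deleted documents. Below is a minimal usage sketch, assuming Lucene 8.x; the index path is a placeholder, and skipping deleted documents via MultiBits is illustrative rather than taken from any example on this page:

import java.nio.file.Paths;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiBits;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;

public class MaxDocSketch {
    public static void main(String[] args) throws Exception {
        // "/path/to/index" is a placeholder; point it at a real Lucene index
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            Bits liveDocs = MultiBits.getLiveDocs(reader); // null when the index has no deletions
            for (int i = 0; i < reader.maxDoc(); i++) {
                if (liveDocs != null && !liveDocs.get(i))
                    continue; // skip deleted document slots
                Document doc = reader.document(i);
                // ... process the stored fields of doc ...
            }
        }
    }
}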
From source file:it.unibz.instasearch.indexing.SearchResultDoc.java
License:Open Source License
private float[] createTermScoreVector(TermFreqVector vect, IndexReader reader) throws IOException {
    if (vect == null)
        return new float[0];

    int[] termFrequencies = vect.getTermFrequencies();
    String[] terms = vect.getTerms();
    float[] scores = new float[terms.length];

    int numDocs = reader.maxDoc();
    Similarity sim = Searcher.SIMILARITY;

    for (int i = 0; i < terms.length; i++) {
        String termText = terms[i];
        Term term = new Term(Field.CONTENTS.toString(), termText);

        float termFreq = sim.tf(termFrequencies[i]);
        int docFreq = reader.docFreq(term);
        float idf = sim.idf(docFreq, numDocs);

        float tfIdf = termFreq * idf;
        scores[i] = tfIdf;
    }
    return scores;
}
From source file:lia.chapter2.IndexingTest.java
License:Apache License
@Test
public void testIndexReader() throws IOException {
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(ids.length, reader.maxDoc()); // 8
    assertEquals(ids.length, reader.numDocs()); // 8
    reader.close();
}
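Note that maxDoc() counts every allocated document slot, deletions included, while numDocs() counts only live documents; the two assertions agree here only because nothing has been deleted from the test index.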
From source file:lia.chapter5.CategorizerTest.java
License:Apache License
private void buildCategoryVectors() throws IOException {
    IndexSearcher searcher = Utils.getBookIndexSearcher();
    IndexReader reader = searcher.getIndexReader();

    int maxDoc = reader.maxDoc();
    System.out.println(maxDoc);

    for (int i = 0; i < maxDoc; i++) {
        Document doc = reader.document(i);
        String category = doc.get("category");
        System.out.println("\n" + doc.get("subject") + "\n");

        Map vectorMap = (Map) categoryMap.get(category);
        if (vectorMap == null) {
            vectorMap = new TreeMap();
            categoryMap.put(category, vectorMap);
        }

        Terms termsVector = reader.getTermVector(i, "subject");
        addTermFreqToMap(vectorMap, termsVector);
    }
}
From source file:liredemo.flickr.TestParallelIndexer.java
License:Open Source License
/**
 * Delete all Fields besides the ones needed.
 *
 * @throws IOException
 */
public void testReduceIndex() throws IOException {
    IndexReader reader = IndexReader.open(FSDirectory.open(new File("./index-mirflickr")));
    IndexWriter writer = LuceneUtils.createIndexWriter(FSDirectory.open(new File("./mirflickr-data-vw")),
            true, LuceneUtils.AnalyzerType.WhitespaceAnalyzer);
    // IndexReader reader = IndexReader.open(FSDirectory.open(new File(indexPath)));
    // IndexWriter writer = LuceneUtils.createIndexWriter(indexPath + "-reduced", true);
    int maxDocs = reader.maxDoc();
    Document d;
    for (int i = 0; i < maxDocs; i++) {
        /*
        if (!reader.isDeleted(0)) {
            d = reader.document(i);
            Document writeDoc = new Document();
            writeDoc.add(d.getFieldable(DocumentBuilder.FIELD_NAME_CEDD));
            // writeDoc.add(d.getFieldable(DocumentBuilder.FIELD_NAME_IDENTIFIER));
            writeDoc.add(d.getFieldable("tags"));
            writeDoc.add(d.getFieldable(DocumentBuilder.FIELD_NAME_SURF_LOCAL_FEATURE_HISTOGRAM_VISUAL_WORDS));
            writer.addDocument(writeDoc);
        }
        */
    }
    writer.close();
}
From source file:liredemo.flickr.TestParallelIndexer.java
License:Open Source License
public void testSearchTime() throws IOException {
    ImageSearcher ceddImageSearcher = new VisualWordsImageSearcher(100,
            DocumentBuilder.FIELD_NAME_SURF + DocumentBuilder.FIELD_NAME_BOVW);
    // ImageSearcher ceddImageSearcher = ImageSearcherFactory.createCEDDImageSearcher(100);
    IndexReader reader = IndexReader.open(FSDirectory.open(new File(indexPath)));
    // IndexReader reader = IndexReader.open(new RAMDirectory(FSDirectory.open(new File(indexPath + "-reduced"))));
    System.out.println("reader.maxDoc() = " + reader.maxDoc());
    for (int i = 0; i < 10; i++) {
        long ms = System.currentTimeMillis();
        ceddImageSearcher.search(reader.document(0), reader);
        System.out.println("s = " + (double) (System.currentTimeMillis() - ms) / 1000d);
    }
}
From source file:lucandra.LucandraFilter.java
License:Apache License
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    OpenBitSet result = new OpenBitSet(reader.maxDoc());

    Map<Integer, String> filterMap = ((lucandra.IndexReader) reader).getDocIndexToDocId();
    List<String> filteredValues = new ArrayList<String>();
    for (Map.Entry<Integer, String> entry : filterMap.entrySet()) {
        filteredValues.add(entry.getValue());
    }

    if (filteredValues.size() == 0)
        return null;

    LucandraTermDocs termDocs = (LucandraTermDocs) reader.termDocs();

    for (Term term : terms) {
        List<ColumnOrSuperColumn> terms = termDocs.filteredSeek(term, filteredValues);

        // This is a conjunction and at least one value must match
        if (terms == null)
            return null;

        while (termDocs.next()) {
            result.set(termDocs.doc());
        }
    }

    termDocs.close();
    return result;
}
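Sizing the OpenBitSet with maxDoc(), as above, is the usual pattern for per-document bitsets: document numbers are bounded by maxDoc(), so every doc ID the filter can encounter fits in the set.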
From source file:lucee.runtime.search.lucene2.LuceneSearchCollection.java
License:Open Source License
/**
 * @param id
 * @param keyColumn
 * @return
 * @throws SearchException
 */
protected IndexResult _deleteCustom(String id, QueryColumn keyColumn) throws SearchException {
    int countBefore = 0;
    int countAfter = 0;
    Map<String, Document> docs = new HashMap<String, Document>();
    Set<String> keys = toSet(keyColumn);
    IndexWriter writer = null;
    String key;
    IndexReader reader = null;
    Document doc;

    synchronized (token) {
        try {
            try {
                reader = _getReader(id, false);
                countBefore = reader.maxDoc();
                for (int i = 0; i < countBefore; i++) {
                    doc = reader.document(i);
                    key = doc.getField("key").stringValue();
                    if (!keys.contains(key))
                        docs.put(key, doc);
                }
            } catch (Exception e) {
            } finally {
                close(reader);
            }
            countAfter = docs.size();

            writer = _getWriter(id, true);
            Iterator<Entry<String, Document>> it = docs.entrySet().iterator();
            while (it.hasNext()) {
                writer.addDocument(it.next().getValue());
            }
            optimizeEL(writer);
        } catch (IOException e) {
            throw new SearchException(e);
        } finally {
            close(writer);
        }
        indexSpellCheck(id);
    }
    int removes = countBefore - countAfter;
    return new IndexResultImpl(removes, 0, 0);
}
From source file:lucee.runtime.search.lucene2.LuceneSearchCollection.java
License:Open Source License
/**
 * @param id
 * @param title
 * @param keyColumn
 * @param bodyColumns
 * @param language
 * @param urlpath
 * @param custom1
 * @param custom2
 * @param custom3
 * @param custom4
 * @return
 * @throws SearchException
 */
protected IndexResult _indexCustom(String id, Object title, QueryColumn keyColumn, QueryColumn[] bodyColumns,
        String language, Object urlpath, Object custom1, Object custom2, Object custom3, Object custom4)
        throws SearchException {
    _checkLanguage(language);
    String t;
    String url;
    String c1;
    String c2;
    String c3;
    String c4;
    int countExisting = 0;
    int countAdd = keyColumn.size();
    int countNew = 0;
    Map<String, Document> docs = new HashMap<String, Document>();
    IndexWriter writer = null;

    synchronized (token) {
        try {
            // read existing reader
            IndexReader reader = null;
            try {
                reader = _getReader(id, false);
                int len = reader.maxDoc();
                Document doc;
                for (int i = 0; i < len; i++) {
                    doc = reader.document(i);
                    docs.put(doc.getField("key").stringValue(), doc);
                }
            } catch (Exception e) {
            } finally {
                close(reader);
            }
            countExisting = docs.size();

            writer = _getWriter(id, true);
            int len = keyColumn.size();
            String key;
            for (int i = 1; i <= len; i++) {
                key = Caster.toString(keyColumn.get(i, null), null);
                if (key == null)
                    continue;

                StringBuilder body = new StringBuilder();
                for (int y = 0; y < bodyColumns.length; y++) {
                    Object tmp = bodyColumns[y].get(i, null);
                    if (tmp != null) {
                        body.append(tmp.toString());
                        body.append(' ');
                    }
                }

                //t=(title==null)?null:Caster.toString(title.get(i,null),null);
                //url=(urlpath==null)?null:Caster.toString(urlpath.get(i,null),null);
                t = getRow(title, i);
                url = getRow(urlpath, i);
                c1 = getRow(custom1, i);
                c2 = getRow(custom2, i);
                c3 = getRow(custom3, i);
                c4 = getRow(custom4, i);

                docs.put(key, CustomDocument.getDocument(t, key, body.toString(), url, c1, c2, c3, c4));
            }
            countNew = docs.size();

            Iterator<Entry<String, Document>> it = docs.entrySet().iterator();
            Entry<String, Document> entry;
            Document doc;
            while (it.hasNext()) {
                entry = it.next();
                doc = entry.getValue();
                writer.addDocument(doc);
            }
            optimizeEL(writer);
            //writer.optimize();
        } catch (IOException ioe) {
            throw new SearchException(ioe);
        } finally {
            close(writer);
        }
        indexSpellCheck(id);
    }
    int inserts = countNew - countExisting;
    return new IndexResultImpl(0, inserts, countAdd - inserts);
}
From source file:lucenesearch.Mallet.java
public void getMalletAllOutput() throws IOException {
    String index = new Searcher().getPostIndexPath();
    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));

    PrintWriter pw = new PrintWriter("./data/mallet_all.txt");
    StringBuilder sb = new StringBuilder();

    for (int i = 0; i < reader.maxDoc(); i++) {
        Document doc = reader.document(i);
        System.out.println("Doc " + i);
        ArrayList<String> res = LuceneUtils.getAnalyzedRemoveHtml(doc.get("Body"));
        int id = Integer.parseInt(doc.get("SId"));
        sb = new StringBuilder();
        sb.append(id);
        sb.append("\t");
        for (String re : res) {
            re = re.replaceAll("\r\n", " ").replaceAll("\n", " ").replaceAll("<.+?>", "")
                    .replaceAll(" +", " ").replaceAll("[^\\x00-\\x7F]", " ").trim();
            sb.append(re).append(" ");
        }
        sb.append("\n");
        pw.print(sb.toString());
    }
    pw.close();
}
From source file:lucenetools.TermData.java
License:Apache License
/**
 * Main application.
 *
 * @param args the command line arguments
 */
public static void main(String[] args) {
    Options opts = new Options();
    CommandLine commandLine = new CommandLine();

    // if no command line options specified, user wants help
    if (0 == args.length) {
        commandLine.showHelp();
        System.exit(0);
    }

    // extract command line args and store in opts
    if (!commandLine.parse(args, opts))
        System.exit(1);

    if (opts.showHelp) {
        commandLine.showHelp();
        System.exit(0);
    }

    // validate all command line options
    if (!commandLine.isValid(opts))
        System.exit(1);

    // report all command line options to the user
    System.out.println("\nLuceneToMtx version " + VERSION + ".");
    commandLine.printOpts(opts);

    long maxMemory = Runtime.getRuntime().maxMemory() / 1024 / 1024;
    System.out.println("Java runtime max memory: " + maxMemory + " MB.");

    // Build a map and assign a dictionary index to each term.
    // Include only those terms that survive the min term freq cutoff.
    Map<String, Integer> dictMap = new TreeMap<>();
    File file = null;

    System.out.println("Processing index...");
    try {
        file = new File(opts.indexDir);
        IndexReader reader = DirectoryReader.open(FSDirectory.open(file));
        TermsEnum te = null;
        int nnz = 0, numCols = 0, maxDocs = reader.maxDoc();
        LinkedList<FeatureVector> matrixData = new LinkedList<>();

        // add other fields
        Collection<String> fields = new ArrayList<>();
        if (opts.fields > 0) {
            fields = MultiFields.getIndexedFields(reader);
            fields.remove(CONTENTSFIELD);
            fields.remove(PATHFIELD);
        }

        if (!extractTerms(reader, dictMap, opts.minTermFreq, maxDocs - 1, opts.maxTermPercentage))
            System.exit(1);

        // set of field names to extract
        Set<String> fieldSet = new HashSet<>();
        fieldSet.add(PATHFIELD);
        for (String s : fields) {
            fieldSet.add(s);
        }

        for (int i = 0; i < maxDocs; ++i) {
            // get term vector for next document
            Terms terms = reader.getTermVector(i, CONTENTSFIELD);
            if (terms == null)
                continue;

            te = terms.iterator(te);
            FeatureVector fv = new FeatureVector(numCols);
            int numEntries = buildFeatureVector(fv, te, dictMap);
            if (numEntries > 0) {
                // extract document path and save with FeatureVector
                Document doc = reader.document(i, fieldSet);
                fv.docPath = doc.get(PATHFIELD);

                // add any additional fields
                for (String s : fields) {
                    fv.fields.put(s, doc.get(s));
                }

                //System.out.println("processing document:" + fv.docPath);
                matrixData.add(fv);
                nnz += numEntries;
                ++numCols;
            }
        }

        // Sort the feature vectors by their document path field. Write
        // the matrix columns in this sorted order.
        Collections.sort(matrixData, new FeatureVectorComparator());

        File outdir = new File(opts.outDir);
        writeMatrixMarketFile(new File(outdir, MATRIXFILE), matrixData, dictMap.size(), numCols, nnz);
        System.out.println("Wrote " + MATRIXFILE + ".");

        writeDictionaryFile(new File(outdir, DICTFILE), dictMap);
        System.out.println("Wrote " + DICTFILE + ".");

        writeDocumentFile(new File(outdir, DOCFILE), matrixData);
        System.out.println("Wrote " + DOCFILE + ".");

        writeFieldFiles(outdir, fields, matrixData);
    } catch (IndexNotFoundException e) {
        if (null != file) {
            System.out.println("Lucene index not found in: " + file.getAbsolutePath());
        }
    } catch (IOException e) {
        System.out.println("LuceneToMtx exception: caught a " + e.getClass()
                + "\nMessage: " + e.getMessage());
    }
}