Example usage for org.apache.lucene.index IndexReader maxDoc

Introduction

On this page you can find example usage of org.apache.lucene.index.IndexReader.maxDoc().

Prototype

public abstract int maxDoc();

Document

Returns one greater than the largest possible document number.
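Because document numbers range over [0, maxDoc()), maxDoc() is the usual loop bound for scanning an index, and it still counts documents that were deleted but not yet merged away; numDocs() counts only live documents. A minimal sketch of the distinction, assuming a Lucene 5.x-style API and a hypothetical index path:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class MaxDocDemo {
    public static void main(String[] args) throws Exception {
        // hypothetical index location; point this at a real index
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/demo-index")));
        try {
            // one greater than the largest document number ever assigned;
            // deleted but not yet merged-away documents still count
            System.out.println("maxDoc  = " + reader.maxDoc());
            // live (non-deleted) documents only
            System.out.println("numDocs = " + reader.numDocs());
        } finally {
            reader.close();
        }
    }
}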

Usage

From source file: it.unibz.instasearch.indexing.SearchResultDoc.java

License: Open Source License

private float[] createTermScoreVector(TermFreqVector vect, IndexReader reader) throws IOException {
    if (vect == null)
        return new float[0];

    int[] termFrequencies = vect.getTermFrequencies();
    String[] terms = vect.getTerms();
    float[] scores = new float[terms.length];

    int numDocs = reader.maxDoc(); // collection size for idf; maxDoc() includes deleted docs
    Similarity sim = Searcher.SIMILARITY;

    for (int i = 0; i < terms.length; i++) {
        String termText = terms[i];
        Term term = new Term(Field.CONTENTS.toString(), termText);

        float termFreq = sim.tf(termFrequencies[i]);

        int docFreq = reader.docFreq(term);
        float idf = sim.idf(docFreq, numDocs);

        float tfIdf = termFreq * idf;

        scores[i] = tfIdf;
    }

    return scores;
}
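For context on the scoring above: with Lucene's classic similarity (an assumption here, since Searcher.SIMILARITY is project-specific), tf(f) = sqrt(f) and idf(d, n) = 1 + ln(n / (d + 1)). A term occurring 4 times in the document, with docFreq 10 in an index whose maxDoc() is 1000, would score sqrt(4) * (1 + ln(1000 / 11)) ≈ 2 * 5.51 ≈ 11.0.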

From source file: lia.chapter2.IndexingTest.java

License: Apache License

@Test
public void testIndexReader() throws IOException {
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(ids.length, reader.maxDoc()); //8
    assertEquals(ids.length, reader.numDocs()); //8
    reader.close();
}
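The two assertions agree only because the test index contains no deletions. A hedged follow-up sketch of how they diverge, assuming a Lucene 5.x-style IndexWriterConfig and that each document carries an "id" field holding the values in ids:

IndexWriterConfig config = new IndexWriterConfig(new WhitespaceAnalyzer());
IndexWriter writer = new IndexWriter(directory, config);
writer.deleteDocuments(new Term("id", ids[0])); // mark one document deleted
writer.close();

IndexReader reader = DirectoryReader.open(directory);
assertEquals(ids.length, reader.maxDoc());      // deleted slot still counted (assuming no merge reclaimed it)
assertEquals(ids.length - 1, reader.numDocs()); // live documents only
reader.close();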

From source file: lia.chapter5.CategorizerTest.java

License: Apache License

private void buildCategoryVectors() throws IOException {
    IndexSearcher searcher = Utils.getBookIndexSearcher();
    IndexReader reader = searcher.getIndexReader();

    int maxDoc = reader.maxDoc();
    System.out.println(maxDoc);
    for (int i = 0; i < maxDoc; i++) {
        Document doc = reader.document(i);
        String category = doc.get("category");
        System.out.println("\n" + doc.get("subject") + "\n");
        Map vectorMap = (Map) categoryMap.get(category);
        if (vectorMap == null) {
            vectorMap = new TreeMap();
            categoryMap.put(category, vectorMap);
        }

        Terms termsVector = reader.getTermVector(i, "subject");

        addTermFreqToMap(vectorMap, termsVector);
    }
}
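The addTermFreqToMap helper is not shown on this page; a hypothetical sketch of what it might do, assuming a Lucene 5.x-style TermsEnum API and the raw-Map style of the surrounding code:

private void addTermFreqToMap(Map vectorMap, Terms termsVector) throws IOException {
    if (termsVector == null)
        return;
    TermsEnum te = termsVector.iterator();
    BytesRef term;
    while ((term = te.next()) != null) {
        String text = term.utf8ToString();
        // for a single-document term vector, totalTermFreq() is the
        // term's frequency within that document
        int freq = (int) te.totalTermFreq();
        Integer existing = (Integer) vectorMap.get(text);
        vectorMap.put(text, existing == null ? freq : existing + freq);
    }
}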

From source file: liredemo.flickr.TestParallelIndexer.java

License: Open Source License

/**
 * Delete all fields besides the ones needed.
 *
 * @throws IOException
 */
public void testReduceIndex() throws IOException {
    IndexReader reader = IndexReader.open(FSDirectory.open(new File("./index-mirflickr")));
    IndexWriter writer = LuceneUtils.createIndexWriter(FSDirectory.open(new File("./mirflickr-data-vw")), true,
            LuceneUtils.AnalyzerType.WhitespaceAnalyzer);
    //        IndexReader reader = IndexReader.open(FSDirectory.open(new File(indexPath)));
    //        IndexWriter writer = LuceneUtils.createIndexWriter(indexPath + "-reduced", true);
    int maxDocs = reader.maxDoc();
    Document d;
    for (int i = 0; i < maxDocs; i++) {
        /*
        if (!reader.isDeleted(0)) {
            d = reader.document(i);
            Document writeDoc = new Document();
            writeDoc.add(d.getFieldable(DocumentBuilder.FIELD_NAME_CEDD));
            // writeDoc.add(d.getFieldable(DocumentBuilder.FIELD_NAME_IDENTIFIER));
            writeDoc.add(d.getFieldable("tags"));
            writeDoc.add(d.getFieldable(DocumentBuilder.FIELD_NAME_SURF_LOCAL_FEATURE_HISTOGRAM_VISUAL_WORDS));
            writer.addDocument(writeDoc);
        }
        */
    }
    writer.close();
}
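As written, the method copies nothing, because the loop body survives only as a comment. A hedged reconstruction of the intended reduction, assuming the Lucene 3.x API this file already uses and reading isDeleted(0) as a typo for isDeleted(i):

for (int i = 0; i < maxDocs; i++) {
    if (reader.isDeleted(i)) // skip deleted documents
        continue;
    d = reader.document(i);
    Document writeDoc = new Document();
    writeDoc.add(d.getFieldable(DocumentBuilder.FIELD_NAME_CEDD));
    writeDoc.add(d.getFieldable("tags"));
    writeDoc.add(d.getFieldable(DocumentBuilder.FIELD_NAME_SURF_LOCAL_FEATURE_HISTOGRAM_VISUAL_WORDS));
    writer.addDocument(writeDoc);
}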

From source file: liredemo.flickr.TestParallelIndexer.java

License: Open Source License

public void testSearchTime() throws IOException {
    ImageSearcher ceddImageSearcher = new VisualWordsImageSearcher(100,
            DocumentBuilder.FIELD_NAME_SURF + DocumentBuilder.FIELD_NAME_BOVW);
    //        ImageSearcher ceddImageSearcher = ImageSearcherFactory.createCEDDImageSearcher(100);
    IndexReader reader = IndexReader.open(FSDirectory.open(new File(indexPath)));
    //        IndexReader reader = IndexReader.open(new RAMDirectory(FSDirectory.open(new File(indexPath + "-reduced"))));
    System.out.println("reader.maxDoc() = " + reader.maxDoc());
    for (int i = 0; i < 10; i++) {
        long ms = System.currentTimeMillis();
        ceddImageSearcher.search(reader.document(0), reader);
        System.out.println("s = " + (double) (System.currentTimeMillis() - ms) / 1000d);
    }
}

From source file: lucandra.LucandraFilter.java

License: Apache License

public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    OpenBitSet result = new OpenBitSet(reader.maxDoc());

    Map<Integer, String> filterMap = ((lucandra.IndexReader) reader).getDocIndexToDocId();

    List<String> filteredValues = new ArrayList<String>();
    for (Map.Entry<Integer, String> entry : filterMap.entrySet()) {
        filteredValues.add(entry.getValue());
    }

    if (filteredValues.size() == 0)
        return null;

    LucandraTermDocs termDocs = (LucandraTermDocs) reader.termDocs();

    for (Term term : terms) {
        // local variable renamed from "terms", which shadowed the field being iterated
        List<ColumnOrSuperColumn> columns = termDocs.filteredSeek(term, filteredValues);
        // This is a conjunction and at least one value must match
        if (columns == null)
            return null;

        while (termDocs.next()) {
            result.set(termDocs.doc());
        }
    }
    termDocs.close();
    return result;
}

From source file: lucee.runtime.search.lucene2.LuceneSearchCollection.java

License: Open Source License

/**
 * @param id
 * @param keyColumn
 * @return
 * @throws SearchException
 */
protected IndexResult _deleteCustom(String id, QueryColumn keyColumn) throws SearchException {

    int countBefore = 0;
    int countAfter = 0;

    Map<String, Document> docs = new HashMap<String, Document>();

    Set<String> keys = toSet(keyColumn);
    IndexWriter writer = null;
    String key;
    IndexReader reader = null;
    Document doc;

    synchronized (token) {
        try {
            try {
                reader = _getReader(id, false);
                countBefore = reader.maxDoc();
                for (int i = 0; i < countBefore; i++) {
                    doc = reader.document(i);
                    key = doc.getField("key").stringValue();
                    if (!keys.contains(key))
                        docs.put(key, doc);
                }
            } catch (Exception e) {
                // ignored: a missing or unreadable index is treated as empty
            } finally {
                close(reader);
            }
            countAfter = docs.size();

            writer = _getWriter(id, true);
            Iterator<Entry<String, Document>> it = docs.entrySet().iterator();
            while (it.hasNext()) {
                writer.addDocument(it.next().getValue());
            }
            optimizeEL(writer);

        } catch (IOException e) {
            throw new SearchException(e);
        } finally {
            close(writer);
        }
        indexSpellCheck(id);
    }
    int removes = countBefore - countAfter;

    return new IndexResultImpl(removes, 0, 0);
}

From source file: lucee.runtime.search.lucene2.LuceneSearchCollection.java

License: Open Source License

/**
 * @param id
 * @param title
 * @param keyColumn
 * @param bodyColumns
 * @param language
 * @param urlpath
 * @param custom1
 * @param custom2
 * @param custom3
 * @param custom4
 * @return
 * @throws SearchException
 */
protected IndexResult _indexCustom(String id, Object title, QueryColumn keyColumn, QueryColumn[] bodyColumns,
        String language, Object urlpath, Object custom1, Object custom2, Object custom3, Object custom4)
        throws SearchException {
    _checkLanguage(language);
    String t;
    String url;
    String c1;
    String c2;
    String c3;
    String c4;

    int countExisting = 0;
    int countAdd = keyColumn.size();
    int countNew = 0;

    Map<String, Document> docs = new HashMap<String, Document>();
    IndexWriter writer = null;
    synchronized (token) {
        try {
            // read existing reader
            IndexReader reader = null;
            try {
                reader = _getReader(id, false);
                int len = reader.maxDoc();
                Document doc;
                for (int i = 0; i < len; i++) {
                    doc = reader.document(i);
                    docs.put(doc.getField("key").stringValue(), doc);
                }
            } catch (Exception e) {
                // ignored: a missing or unreadable index is treated as empty
            } finally {
                close(reader);
            }

            countExisting = docs.size();
            writer = _getWriter(id, true);
            int len = keyColumn.size();
            String key;
            for (int i = 1; i <= len; i++) {
                key = Caster.toString(keyColumn.get(i, null), null);
                if (key == null)
                    continue;

                StringBuilder body = new StringBuilder();
                for (int y = 0; y < bodyColumns.length; y++) {
                    Object tmp = bodyColumns[y].get(i, null);
                    if (tmp != null) {
                        body.append(tmp.toString());
                        body.append(' ');
                    }
                }
                //t=(title==null)?null:Caster.toString(title.get(i,null),null);
                //url=(urlpath==null)?null:Caster.toString(urlpath.get(i,null),null);

                t = getRow(title, i);
                url = getRow(urlpath, i);
                c1 = getRow(custom1, i);
                c2 = getRow(custom2, i);
                c3 = getRow(custom3, i);
                c4 = getRow(custom4, i);

                docs.put(key, CustomDocument.getDocument(t, key, body.toString(), url, c1, c2, c3, c4));
            }
            countNew = docs.size();
            Iterator<Entry<String, Document>> it = docs.entrySet().iterator();
            Entry<String, Document> entry;
            Document doc;
            while (it.hasNext()) {
                entry = it.next();
                doc = entry.getValue();
                writer.addDocument(doc);
            }
            optimizeEL(writer);
            //writer.optimize();

        } catch (IOException ioe) {
            throw new SearchException(ioe);
        } finally {
            close(writer);
        }
        indexSpellCheck(id);
    }
    int inserts = countNew - countExisting;

    return new IndexResultImpl(0, inserts, countAdd - inserts);
}

From source file: lucenesearch.Mallet.java

public void getMalletAllOutput() throws IOException {

    String index = new Searcher().getPostIndexPath();
    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));

    PrintWriter pw = new PrintWriter("./data/mallet_all.txt");

    StringBuilder sb = new StringBuilder();

    for (int i = 0; i < reader.maxDoc(); i++) {
        Document doc = reader.document(i);
        System.out.println("Doc " + i);

        ArrayList<String> res = LuceneUtils.getAnalyzedRemoveHtml(doc.get("Body"));

        int id = Integer.parseInt(doc.get("SId"));
        sb = new StringBuilder();
        sb.append(id);
        sb.append("\t");
        for (String re : res) {
            re = re.replaceAll("\r\n", " ").replaceAll("\n", " ").replaceAll("<.+?>", "").replaceAll(" +", " ")
                    .replaceAll("[^\\x00-\\x7F]", " ").trim();
            sb.append(re).append(" ");
        }
        sb.append("\n");
        pw.print(sb.toString());

    }
    pw.close();

}
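For reference, each line of mallet_all.txt produced above is the post's numeric SId, a tab, then the cleaned body tokens, e.g. (hypothetical values):

42	cleaned tokens of the post body ...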

From source file: lucenetools.TermData.java

License: Apache License

/**
 * Main application.
 *
 * @param args the command line arguments
 */
public static void main(String[] args) {
    Options opts = new Options();
    CommandLine commandLine = new CommandLine();

    // if no command line options specified, user wants help
    if (0 == args.length) {
        commandLine.showHelp();
        System.exit(0);
    }

    // extract command line args and store in opts
    if (!commandLine.parse(args, opts))
        System.exit(1);

    if (opts.showHelp) {
        commandLine.showHelp();
        System.exit(0);
    }

    // validate all command line options
    if (!commandLine.isValid(opts))
        System.exit(1);

    // report all command line options to the user
    System.out.println("\nLuceneToMtx version " + VERSION + ".");
    commandLine.printOpts(opts);

    long maxMemory = Runtime.getRuntime().maxMemory() / 1024 / 1024;
    System.out.println("Java runtime max memory: " + maxMemory + " MB.");

    // Build a map and assign a dictionary index to each term.
    // Include only those terms that survive the min term freq cutoff.
    Map<String, Integer> dictMap = new TreeMap<>();

    File file = null;
    System.out.println("Processing index...");
    try {
        file = new File(opts.indexDir);
        IndexReader reader = DirectoryReader.open(FSDirectory.open(file));
        TermsEnum te = null;
        int nnz = 0, numCols = 0, maxDocs = reader.maxDoc();
        LinkedList<FeatureVector> matrixData = new LinkedList<>();

        // add other fields
        Collection<String> fields = new ArrayList<>();
        if (opts.fields > 0) {
            fields = MultiFields.getIndexedFields(reader);
            fields.remove(CONTENTSFIELD);
            fields.remove(PATHFIELD);
        }

        if (!extractTerms(reader, dictMap, opts.minTermFreq, maxDocs - 1, opts.maxTermPercentage))
            System.exit(1);

        // set of field names to extract
        Set<String> fieldSet = new HashSet<>();
        fieldSet.add(PATHFIELD);
        for (String s : fields) {
            fieldSet.add(s);
        }

        for (int i = 0; i < maxDocs; ++i) {
            // get term vector for next document
            Terms terms = reader.getTermVector(i, CONTENTSFIELD);
            if (terms == null)
                continue;

            te = terms.iterator(te);
            FeatureVector fv = new FeatureVector(numCols);

            int numEntries = buildFeatureVector(fv, te, dictMap);
            if (numEntries > 0) {
                // extract document path and save with FeatureVector
                Document doc = reader.document(i, fieldSet);
                fv.docPath = doc.get(PATHFIELD);

                // add any additional fields
                for (String s : fields) {
                    fv.fields.put(s, doc.get(s));
                }

                //System.out.println("processing document:" + fv.docPath);

                matrixData.add(fv);
                nnz += numEntries;
                ++numCols;
            }
        }

        // Sort the feature vectors by their document path field.  Write 
        // the matrix columns in this sorted order.
        Collections.sort(matrixData, new FeatureVectorComparator());
        File outdir = new File(opts.outDir);
        writeMatrixMarketFile(new File(outdir, MATRIXFILE), matrixData, dictMap.size(), numCols, nnz);
        System.out.println("Wrote " + MATRIXFILE + ".");
        writeDictionaryFile(new File(outdir, DICTFILE), dictMap);
        System.out.println("Wrote " + DICTFILE + ".");
        writeDocumentFile(new File(outdir, DOCFILE), matrixData);
        System.out.println("Wrote " + DOCFILE + ".");
        writeFieldFiles(outdir, fields, matrixData);
    } catch (IndexNotFoundException e) {
        if (null != file) {
            System.out.println("Lucene index not found in: " + file.getAbsolutePath());
        }
    } catch (IOException e) {
        System.out.println("LuceneToMtx exception: caught a " + e.getClass() + "\nMessage: " + e.getMessage());
    }
}