Example usage for org.apache.lucene.index IndexReader maxDoc

Introduction

On this page you can find example usages of the org.apache.lucene.index IndexReader method maxDoc().

Prototype

public abstract int maxDoc();

Document

Returns one greater than the largest possible document number.
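
Before the full examples below, here is a minimal sketch of the usual pattern built on maxDoc(): iterate document numbers from 0 (inclusive) to maxDoc() (exclusive), skipping deletions via the live-docs bitset, since maxDoc() also counts deleted documents. It assumes a Lucene 5/6-style API (matching the examples on this page); the index path and the "title" field are placeholders for illustration only.

import java.nio.file.Paths;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;

public class MaxDocExample {

    public static void main(String[] args) throws Exception {
        // "/path/to/index" is a placeholder; point it at an existing Lucene index.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // null when the index contains no deletions
            Bits liveDocs = MultiFields.getLiveDocs(reader);

            // valid document numbers are [0, maxDoc())
            for (int i = 0; i < reader.maxDoc(); i++) {
                if (liveDocs != null && !liveDocs.get(i)) {
                    continue; // skip deleted documents
                }
                Document doc = reader.document(i);
                System.out.println(i + " -> " + doc.get("title"));
            }
        }
    }
}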

Usage

From source file:com.github.tteofili.looseen.MinHashClassifier.java

License:Apache License

public MinHashClassifier(IndexReader reader, String textField, String categoryField, int min, int hashCount,
        int hashSize) {
    this.min = min;
    this.hashCount = hashCount;
    this.hashSize = hashSize;
    try {
        Analyzer analyzer = createMinHashAnalyzer(min, hashCount, hashSize);
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        directory = new RAMDirectory();
        IndexWriter writer = new IndexWriter(directory, config);
        for (int i = 0; i < reader.maxDoc(); i++) {
            Document document = new Document();
            Document d = reader.document(i);
            String textValue = d.getField(textField).stringValue();
            String categoryValue = d.getField(categoryField).stringValue();
            document.add(new TextField(TEXT_FIELD, textValue, Field.Store.NO));
            document.add(new StringField(CLASS_FIELD, categoryValue, Field.Store.YES));
            writer.addDocument(document);
        }
        writer.commit();
        writer.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE);
}

From source file:com.github.tteofili.looseen.Test20NewsgroupsClassification.java

License:Apache License

@Test
public void test20Newsgroups() throws Exception {

    String indexProperty = System.getProperty("index");
    if (indexProperty != null) {
        try {
            index = Boolean.valueOf(indexProperty);
        } catch (Exception e) {
            // ignore
        }
    }

    String splitProperty = System.getProperty("split");
    if (splitProperty != null) {
        try {
            split = Boolean.valueOf(splitProperty);
        } catch (Exception e) {
            // ignore
        }
    }

    Path mainIndexPath = Paths.get(INDEX + "/original");
    Directory directory = FSDirectory.open(mainIndexPath);
    Path trainPath = Paths.get(INDEX + "/train");
    Path testPath = Paths.get(INDEX + "/test");
    Path cvPath = Paths.get(INDEX + "/cv");
    FSDirectory cv = null;
    FSDirectory test = null;
    FSDirectory train = null;
    IndexReader testReader = null;
    if (split) {
        cv = FSDirectory.open(cvPath);
        test = FSDirectory.open(testPath);
        train = FSDirectory.open(trainPath);
    }

    if (index) {
        delete(mainIndexPath);
        if (split) {
            delete(trainPath, testPath, cvPath);
        }
    }

    IndexReader reader = null;
    List<Classifier<BytesRef>> classifiers = new LinkedList<>();
    try {
        Analyzer analyzer = new StandardAnalyzer();
        if (index) {

            System.out.format("Indexing 20 Newsgroups...%n");

            long startIndex = System.currentTimeMillis();
            IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer));

            buildIndex(new File(PREFIX + "/20n/20_newsgroups"), indexWriter);

            long endIndex = System.currentTimeMillis();
            System.out.format("Indexed %d pages in %ds %n", indexWriter.maxDoc(),
                    (endIndex - startIndex) / 1000);

            indexWriter.close();

        }

        if (split && !index) {
            reader = DirectoryReader.open(train);
        } else {
            reader = DirectoryReader.open(directory);
        }

        if (index && split) {
            // split the index
            System.out.format("Splitting the index...%n");

            long startSplit = System.currentTimeMillis();
            DatasetSplitter datasetSplitter = new DatasetSplitter(0.1, 0);
            datasetSplitter.split(reader, train, test, cv, analyzer, false, CATEGORY_FIELD, BODY_FIELD,
                    SUBJECT_FIELD, CATEGORY_FIELD);
            reader.close();
            reader = DirectoryReader.open(train); // using the train index from now on
            long endSplit = System.currentTimeMillis();
            System.out.format("Splitting done in %ds %n", (endSplit - startSplit) / 1000);
        }

        final long startTime = System.currentTimeMillis();

        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, 0, 0,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, CATEGORY_FIELD,
                BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 0, 0,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new AxiomaticF1EXP(), analyzer, null, 3, 0, 0,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new AxiomaticF1LOG(), analyzer, null, 3, 0, 0,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMDirichletSimilarity(), analyzer, null, 3,
                1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer,
                null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 3, 1, 1, CATEGORY_FIELD,
                BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelG(), new AfterEffectB(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH3()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionSPL(), new LambdaDF(), new Normalization.NoNormalization()),
                analyzer, null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new MinHashClassifier(reader, BODY_FIELD, CATEGORY_FIELD, 15, 1, 100));
        classifiers.add(new MinHashClassifier(reader, BODY_FIELD, CATEGORY_FIELD, 30, 3, 300));
        classifiers.add(new MinHashClassifier(reader, BODY_FIELD, CATEGORY_FIELD, 10, 1, 100));
        classifiers.add(new KNearestFuzzyClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer, null,
                1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader,
                new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null,
                1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 1,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 3,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers
                .add(new KNearestFuzzyClassifier(reader, null, analyzer, null, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers
                .add(new KNearestFuzzyClassifier(reader, null, analyzer, null, 3, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new AxiomaticF1EXP(), analyzer, null, 3,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new AxiomaticF1LOG(), analyzer, null, 3,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new BM25NBClassifier(reader, analyzer, null, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new CachingNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new SimpleNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, BODY_FIELD));

        int maxdoc;

        if (split) {
            testReader = DirectoryReader.open(test);
            maxdoc = testReader.maxDoc();
        } else {
            maxdoc = reader.maxDoc();
        }

        System.out.format("Starting evaluation on %d docs...%n", maxdoc);

        ExecutorService service = Executors.newCachedThreadPool();
        List<Future<String>> futures = new LinkedList<>();
        for (Classifier<BytesRef> classifier : classifiers) {
            testClassifier(reader, startTime, testReader, service, futures, classifier);
        }
        for (Future<String> f : futures) {
            System.out.println(f.get());
        }

        Thread.sleep(10000);
        service.shutdown();

    } finally {
        if (reader != null) {
            reader.close();
        }
        directory.close();
        if (test != null) {
            test.close();
        }
        if (train != null) {
            train.close();
        }
        if (cv != null) {
            cv.close();
        }
        if (testReader != null) {
            testReader.close();
        }

        for (Classifier c : classifiers) {
            if (c instanceof Closeable) {
                ((Closeable) c).close();
            }
        }
    }
}

From source file:com.github.tteofili.looseen.TestWikipediaClassification.java

License:Apache License

@Test
public void testItalianWikipedia() throws Exception {

    String indexProperty = System.getProperty("index");
    if (indexProperty != null) {
        try {
            index = Boolean.valueOf(indexProperty);
        } catch (Exception e) {
            // ignore
        }
    }

    String splitProperty = System.getProperty("split");
    if (splitProperty != null) {
        try {
            split = Boolean.valueOf(splitProperty);
        } catch (Exception e) {
            // ignore
        }
    }

    Path mainIndexPath = Paths.get(INDEX + "/original");
    Directory directory = FSDirectory.open(mainIndexPath);
    Path trainPath = Paths.get(INDEX + "/train");
    Path testPath = Paths.get(INDEX + "/test");
    Path cvPath = Paths.get(INDEX + "/cv");
    FSDirectory cv = null;
    FSDirectory test = null;
    FSDirectory train = null;
    DirectoryReader testReader = null;
    if (split) {
        cv = FSDirectory.open(cvPath);
        test = FSDirectory.open(testPath);
        train = FSDirectory.open(trainPath);
    }

    if (index) {
        delete(mainIndexPath);
        if (split) {
            delete(trainPath, testPath, cvPath);
        }
    }

    IndexReader reader = null;
    try {
        Collection<String> stopWordsList = Arrays.asList("di", "a", "da", "in", "per", "tra", "fra", "il", "lo",
                "la", "i", "gli", "le");
        CharArraySet stopWords = new CharArraySet(stopWordsList, true);
        Analyzer analyzer = new ItalianAnalyzer(stopWords);
        if (index) {

            System.out.format("Indexing Italian Wikipedia...%n");

            long startIndex = System.currentTimeMillis();
            IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer));

            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current1.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current2.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current3.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current4.xml"), indexWriter);

            long endIndex = System.currentTimeMillis();
            System.out.format("Indexed %d pages in %ds %n", indexWriter.maxDoc(),
                    (endIndex - startIndex) / 1000);

            indexWriter.close();

        }

        if (split && !index) {
            reader = DirectoryReader.open(train);
        } else {
            reader = DirectoryReader.open(directory);
        }

        if (index && split) {
            // split the index
            System.out.format("Splitting the index...%n");

            long startSplit = System.currentTimeMillis();
            DatasetSplitter datasetSplitter = new DatasetSplitter(0.1, 0);
            for (LeafReaderContext context : reader.leaves()) {
                datasetSplitter.split(context.reader(), train, test, cv, analyzer, false, CATEGORY_FIELD,
                        TEXT_FIELD, CATEGORY_FIELD);
            }
            reader.close();
            reader = DirectoryReader.open(train); // using the train index from now on
            long endSplit = System.currentTimeMillis();
            System.out.format("Splitting done in %ds %n", (endSplit - startSplit) / 1000);
        }

        final long startTime = System.currentTimeMillis();

        List<Classifier<BytesRef>> classifiers = new LinkedList<>();
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new BM25Similarity(), analyzer, null, 1, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, CATEGORY_FIELD,
                TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMDirichletSimilarity(), analyzer, null, 3,
                1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer,
                null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 1, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelG(), new AfterEffectB(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH3()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionSPL(), new LambdaDF(), new Normalization.NoNormalization()),
                analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 10, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 300));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 3, 100));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 3,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 3,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new BM25NBClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new CachingNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new SimpleNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));

        int maxdoc;

        if (split) {
            testReader = DirectoryReader.open(test);
            maxdoc = testReader.maxDoc();
        } else {
            maxdoc = reader.maxDoc();
        }

        System.out.format("Starting evaluation on %d docs...%n", maxdoc);

        ExecutorService service = Executors.newCachedThreadPool();
        List<Future<String>> futures = new LinkedList<>();
        for (Classifier<BytesRef> classifier : classifiers) {

            final IndexReader finalReader = reader;
            final DirectoryReader finalTestReader = testReader;
            futures.add(service.submit(() -> {
                ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix;
                if (split) {
                    confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalTestReader, classifier,
                            CATEGORY_FIELD, TEXT_FIELD, 60000 * 30);
                } else {
                    confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalReader, classifier,
                            CATEGORY_FIELD, TEXT_FIELD, 60000 * 30);
                }

                final long endTime = System.currentTimeMillis();
                final int elapse = (int) (endTime - startTime) / 1000;

                return " * " + classifier + " \n    * accuracy = " + confusionMatrix.getAccuracy()
                        + "\n    * precision = " + confusionMatrix.getPrecision() + "\n    * recall = "
                        + confusionMatrix.getRecall() + "\n    * f1-measure = " + confusionMatrix.getF1Measure()
                        + "\n    * avgClassificationTime = " + confusionMatrix.getAvgClassificationTime()
                        + "\n    * time = " + elapse + " (sec)\n ";
            }));

        }
        for (Future<String> f : futures) {
            System.out.println(f.get());
        }

        Thread.sleep(10000);
        service.shutdown();

    } finally {
        try {
            if (reader != null) {
                reader.close();
            }
            if (directory != null) {
                directory.close();
            }
            if (test != null) {
                test.close();
            }
            if (train != null) {
                train.close();
            }
            if (cv != null) {
                cv.close();
            }
            if (testReader != null) {
                testReader.close();
            }
        } catch (Throwable e) {
            e.printStackTrace();
        }
    }
}

From source file:com.globalsight.ling.lucene.analysis.ChainedFilter.java

License:Apache License

/**
 * Delegates to each filter in the chain.
 * @param reader IndexReader
 * @param logic Logical operation
 * @return BitSet
 */
private BitSet bits(IndexReader reader, int logic) throws IOException {
    BitSet result;
    int i = 0;

    // First AND operation takes place against a completely false
    // bitset and will always return zero results. Thanks to
    // Daniel Armbrust for pointing this out and suggesting workaround.
    if (logic == AND) {
        result = new BitSet();
        //result = (BitSet)chain[i].bits(reader).clone();
        ++i;
    } else {
        result = new BitSet(reader.maxDoc());
    }

    for (; i < chain.length; i++) {
        doChain(result, reader, logic, chain[i]);
    }
    return result;
}

From source file:com.globalsight.ling.lucene.analysis.ChainedFilter.java

License:Apache License

/**
 * Delegates to each filter in the chain.
 * @param reader IndexReader
 * @param logic Logical operation
 * @return BitSet
 */
private BitSet bits(IndexReader reader, int[] logic) throws IOException {
    if (logic.length != chain.length) {
        throw new IllegalArgumentException("Invalid number of elements in logic array");
    }

    BitSet result;
    int i = 0;

    // First AND operation takes place against a completely false
    // bitset and will always return zero results. Thanks to
    // Daniel Armbrust for pointing this out and suggesting workaround.
    if (logic[0] == AND) {
        result = new BitSet();
        //result = (BitSet)chain[i].bits(reader).clone();
        ++i;
    } else {
        result = new BitSet(reader.maxDoc());
    }

    for (; i < chain.length; i++) {
        doChain(result, reader, logic[i], chain[i]);
    }

    return result;
}

From source file:com.greplin.lucene.filter.PhraseFilter.java

License:Apache License

@Override
public DocIdSet getDocIdSet(final IndexReader reader) throws IOException {
    List<IndexReader> subReaders = IndexReaders.gatherSubReaders(reader);
    PhraseFilterMatchList[] results = new PhraseFilterMatchList[subReaders.size()];
    int matchCount = 0;
    int readerNumber = 0;

    for (IndexReader subReader : subReaders) {
        SortedSet<TermWithFrequency> termsOrderedByFrequency = Sets.newTreeSet();
        for (int i = 0; i < this.terms.length; i++) {
            Term t = this.terms[i];
            termsOrderedByFrequency.add(new TermWithFrequency(t, subReader.docFreq(t), i));
        }

        PhraseFilterMatchList matches = null;
        TermPositions termPositions = subReader.termPositions();
        try {
            for (TermWithFrequency term : termsOrderedByFrequency) {
                if (term.docFreq == 0) {
                    break;
                }

                termPositions.seek(term.term);

                if (matches == null) {
                    // If this is the first term, collect all matches that intersect
                    // with the provided initial document set.
                    Intersection intersection = this.intersectionProvider.get(reader);

                    matches = new PhraseFilterMatchList(term.docFreq);
                    while (intersection.advanceToNextIntersection(termPositions)) {
                        int freq = termPositions.freq();
                        PhraseFilterIntList list = new PhraseFilterIntList(freq);
                        for (int i = 0; i < freq; i++) {
                            list.add(termPositions.nextPosition() - term.offset);
                        }
                        matches.add(termPositions.doc(), list);
                    }
                } else {
                    // Otherwise, intersect with the existing matches.
                    matches.intersect(termPositions, term.offset);
                }

                if (matches.getCount() == 0) {
                    break;
                }
            }
        } finally {
            termPositions.close();
        }

        if (matches != null) {
            results[readerNumber] = matches;
            matchCount += matches.getCount();
        }
        readerNumber++;
    }

    final int bitsPerIntPowerLogTwo = 5; // 2^5 = 32
    if (matchCount > reader.maxDoc() >> bitsPerIntPowerLogTwo) {
        FixedBitSet result = new FixedBitSet(reader.maxDoc());
        int readerOffset = 0;
        for (int readerIndex = 0; readerIndex < results.length; readerIndex++) {
            PhraseFilterMatchList matches = results[readerIndex];
            if (matches != null) {
                int count = matches.getCount();
                int[] docIds = matches.getDocIds();
                for (int i = 0; i < count; i++) {
                    result.set(docIds[i] + readerOffset);
                }
            }
            readerOffset += subReaders.get(readerIndex).maxDoc();
        }
        return result;
    } else if (matchCount == 0) {
        return DocIdSets.EMPTY;
    } else {
        int[] result = new int[matchCount];
        int base = 0;
        int readerOffset = 0;
        for (int readerIndex = 0; readerIndex < results.length; readerIndex++) {
            PhraseFilterMatchList matches = results[readerIndex];
            if (matches != null) {
                int count = matches.getCount();
                int[] docIds = matches.getDocIds();
                for (int i = 0; i < count; i++) {
                    result[base + i] = docIds[i] + readerOffset;
                }
                base += count;
            }
            readerOffset += subReaders.get(readerIndex).maxDoc();
        }
        return new SortedIntArrayDocIdSet(result);
    }
}

From source file:com.greplin.lucene.filter.TermsFilter.java

License:Apache License

@Override
public DocIdSet getDocIdSet(final IndexReader reader) throws IOException {
    FixedBitSet result = new FixedBitSet(reader.maxDoc());
    TermDocs td = reader.termDocs();
    try {
        for (Term term : this.terms) {
            td.seek(term);
            while (td.next()) {
                result.set(td.doc());
            }
        }
    } finally {
        td.close();
    }
    return result;
}

From source file:com.ideabase.repository.core.index.ExtendedRangeFilter.java

License:Open Source License

@Override
public BitSet bits(IndexReader reader) throws IOException {
    final BitSet bits = new BitSet(reader.maxDoc());
    final TermEnum enumerator = (mIncludeLower ? reader.terms(new Term(mFieldName, mLowerTerm))
            : reader.terms(new Term(mFieldName, "")));
    try {
        if (enumerator.term() == null) {
            return bits;
        }

        boolean checkLower = mIncludeLower;
        final TermDocs termDocs = reader.termDocs();
        try {
            do {
                final Term currentTerm = enumerator.term();
                final boolean sameField = (currentTerm != null && currentTerm.field().equals(mFieldName));
                if (sameField) {
                    Long valueNumber = 0L;
                    String valueText = currentTerm.text();
                    final boolean noLowerBound = !checkLower || mLowerTerm == null;
                    final int compareResult;
                    if (mNumberField) {
                        valueNumber = Long.valueOf(valueText);
                        compareResult = valueNumber.compareTo(mLowerTermNumber);
                    } else {
                        compareResult = valueText.compareTo(mLowerTerm);
                    }
                    if (noLowerBound || compareResult > 0) {
                        checkLower = false;
                        if (mUpperTerm != null) {
                            final int compare;
                            if (mNumberField) {
                                compare = mUpperTermNumber.compareTo(valueNumber);
                            } else {
                                compare = mUpperTerm.compareTo(valueText);
                            }
                            /*
                             * if beyond the upper term, or is exclusive and
                             * this is equal to the upper term, break out
                             */
                            if ((compare < 0) || (!mIncludeUpper && compare == 0)) {
                                break;
                            }
                        }
                        /* we have a good term, find the docs */
                        termDocs.seek(enumerator.term());
                        while (termDocs.next()) {
                            bits.set(termDocs.doc());
                        }
                    }
                } else {
                    break;
                }
            } while (enumerator.next());

        } finally {
            termDocs.close();
        }
    } finally {
        enumerator.close();
    }

    return bits;
}

From source file:com.ikon.module.db.stuff.IndexHelper.java

License:Open Source License

public void checkIndexOnStartup() {
    //log.info("Observed event {1} from Thread {0}", Thread.currentThread().getName(), App.INIT_SUCCESS);

    // See if we need to rebuild the index during startup ...
    FullTextEntityManager ftEm = Search.getFullTextEntityManager(entityManager);
    SearchFactory searchFactory = ftEm.getSearchFactory();
    ReaderProvider readerProvider = searchFactory.getReaderProvider();
    IndexReader reader = readerProvider
            .openReader(searchFactory.getDirectoryProviders(NodeDocumentVersion.class)[0]);
    int maxDoc = 0;

    try {
        maxDoc = reader.maxDoc();
    } finally {
        readerProvider.closeReader(reader);
    }

    if (maxDoc == 0) {
        log.warn("No objects indexed ... rebuilding Lucene search index from database ...");
        long _exit = 0L;
        long _entr = System.currentTimeMillis();

        try {
            int docs = doRebuildIndex();
            _exit = System.currentTimeMillis();
            log.info("Took " + (_exit - _entr) + " (ms) to re-build the index containing " + docs
                    + " documents.");
        } catch (Exception exc) {
            if (exc instanceof RuntimeException) {
                throw (RuntimeException) exc;
            } else {
                throw new RuntimeException(exc);
            }
        }

        // build the spell checker index off of the HS index.
        buildSpellCheckerIndex(searchFactory);
    }
}

From source file:com.ikon.servlet.admin.ListIndexesServlet.java

License:Open Source License

/**
 * List Lucene indexes
 */
@SuppressWarnings("unchecked")
private void showLuceneDocument(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    boolean showTerms = WebUtils.getBoolean(request, "showTerms");
    int id = WebUtils.getInt(request, "id", 0);
    FullTextSession ftSession = null;
    ReaderProvider rProv = null;
    Session session = null;
    IndexReader idx = null;
    List<Map<String, String>> fields = new ArrayList<Map<String, String>>();

    try {
        session = HibernateUtil.getSessionFactory().openSession();
        ftSession = Search.getFullTextSession(session);
        SearchFactory sFactory = ftSession.getSearchFactory();
        rProv = sFactory.getReaderProvider();

        DirectoryProvider<Directory>[] dirProv = sFactory.getDirectoryProviders(NodeDocument.class);
        idx = rProv.openReader(dirProv[0]);

        // Print Lucene documents
        if (!idx.isDeleted(id)) {
            Document doc = idx.document(id);
            String hibClass = null;

            for (Fieldable fld : doc.getFields()) {
                Map<String, String> field = new HashMap<String, String>();
                field.put("name", fld.name());
                field.put("value", fld.stringValue());
                fields.add(field);

                if (fld.name().equals("_hibernate_class")) {
                    hibClass = fld.stringValue();
                }
            }

            /**
             * 1) Get all the terms using indexReader.terms()
             * 2) Process the term only if it belongs to the target field.
             * 3) Get all the docs using indexReader.termDocs(term);
             * 4) So, we have the term-doc pairs at this point.
             */
            if (showTerms && NodeDocument.class.getCanonicalName().equals(hibClass)) {
                List<String> terms = new ArrayList<String>();

                for (TermEnum te = idx.terms(); te.next();) {
                    Term t = te.term();

                    if ("text".equals(t.field())) {
                        for (TermDocs tds = idx.termDocs(t); tds.next();) {
                            if (id == tds.doc()) {
                                terms.add(t.text());
                            }
                        }
                    }
                }

                Map<String, String> field = new HashMap<String, String>();
                field.put("name", "terms");
                field.put("value", terms.toString());
                fields.add(field);
            }
        }

        ServletContext sc = getServletContext();
        sc.setAttribute("fields", fields);
        sc.setAttribute("id", id);
        sc.setAttribute("max", idx.maxDoc() - 1);
        sc.setAttribute("prev", id > 0);
        sc.setAttribute("next", id < idx.maxDoc() - 1);
        sc.setAttribute("showTerms", showTerms);
        sc.getRequestDispatcher("/admin/list_indexes.jsp").forward(request, response);
    } finally {
        if (rProv != null && idx != null) {
            rProv.closeReader(idx);
        }

        HibernateUtil.close(ftSession);
        HibernateUtil.close(session);
    }
}