Example usage for org.apache.lucene.analysis CharArraySet EMPTY_SET

Introduction

This page collects example usages of the EMPTY_SET field of org.apache.lucene.analysis.CharArraySet, taken from Apache-licensed source files.

Prototype

public static final CharArraySet EMPTY_SET

Document

An empty CharArraySet.
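
A minimal sketch of the common pattern (the EnglishAnalyzer constructors used here also appear in the examples below): passing EMPTY_SET where an analyzer expects a stop-word set disables stop-word removal entirely.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.en.EnglishAnalyzer;

// No stop words are removed: the analyzer receives the shared empty set.
Analyzer keepEverything = new EnglishAnalyzer(CharArraySet.EMPTY_SET);

// For comparison, the no-argument constructor uses EnglishAnalyzer's default stop list.
Analyzer withDefaults = new EnglishAnalyzer();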

Usage

From source file:RomanianAnalyzer.java

License:Apache License

/**
 * Builds an analyzer with the given stop words.
 * @param matchVersion lucene compatibility version
 * @param stopwords a stopword set
 */
public RomanianAnalyzer(Version matchVersion, Set<?> stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}
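
Judging by Lucene's stock RomanianAnalyzer, the three-argument constructor this delegates to takes a stem-exclusion set as its final parameter, so EMPTY_SET here means no terms are protected from stemming; the FoldingSpanishAnalyzer example further down follows the same delegation pattern. A minimal sketch of invoking the constructor shown above, assuming the class mirrors the stock analyzer (Version.LUCENE_47 and RomanianAnalyzer.getDefaultStopSet() are assumptions about the Lucene release in use):

// Default Romanian stop words; the chained call supplies EMPTY_SET as the stem-exclusion set.
Analyzer analyzer = new RomanianAnalyzer(Version.LUCENE_47, RomanianAnalyzer.getDefaultStopSet());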

From source file:ai.castor.idf.FetchTermIDF.java

License:Apache License

public double getTermIDF(String term) throws ParseException {
    Analyzer analyzer = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser qp = new QueryParser(FIELD_BODY, analyzer);
    ClassicSimilarity similarity = new ClassicSimilarity();

    String esTerm = qp.escape(term);
    double termIDF = 0.0;
    try {
        TermQuery q = (TermQuery) qp.parse(esTerm);
        Term t = q.getTerm();
        termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());

        System.out.println(term + '\t' + esTerm + '\t' + q + '\t' + t + '\t' + termIDF);
    } catch (Exception e) {
        System.err.println("Exception in fetching IDF(" + term + "): " + e.toString());
    }
    return termIDF;
}
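
For reference, ClassicSimilarity.idf in the Lucene versions these examples target is computed as 1 + ln(numDocs / (docFreq + 1)), so the value printed above can be reproduced by hand. A small sketch with made-up counts:

// Hypothetical counts: the term occurs in 9 of 1,000 documents.
int docFreq = 9;
int numDocs = 1000;
double termIDF = 1.0 + Math.log(numDocs / (double) (docFreq + 1)); // ln(100) + 1 ≈ 5.61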

From source file:com.mozilla.grouperfish.lucene.analysis.en.EnglishAnalyzer.java

License:Apache License

/**
 * Builds an analyzer with the given stop words.
 * @param matchVersion
 *            lucene compatibility version
 * @param stopwords
 *            a stopword set
 */
public EnglishAnalyzer(Version matchVersion, Set<?> stopwords, boolean stem) {
    this(matchVersion, stopwords, stem, CharArraySet.EMPTY_SET);
}

From source file:com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer.java

License:Apache License

public NGramEnglishAnalyzer(Version version, Set<?> stopwords, boolean stem, boolean outputUnigrams) {
    this(version, stopwords, stem, outputUnigrams, ShingleAllStopFilter.DEFAULT_MIN_SHINGLE_SIZE,
            ShingleAllStopFilter.DEFAULT_MAX_SHINGLE_SIZE, CharArraySet.EMPTY_SET);
}

From source file:com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer.java

License:Apache License

public NGramEnglishAnalyzer(Version version, Set<?> stopwords, boolean stem, boolean outputUnigrams,
        int minNGram, int maxNGram) {
    this(version, stopwords, stem, outputUnigrams, minNGram, maxNGram, CharArraySet.EMPTY_SET);
}

From source file:com.qwazr.search.bench.test.MultiField.PayloadAnalyzer.java

License:Apache License

@Override
final protected TokenStreamComponents createComponents(final String fieldName) {

    final Tokenizer tokenizer = new UAX29URLEmailTokenizer();
    // Read the payload from the first token
    final FirstTokenPayloadFilter firstTokenPayloadFilter = new FirstTokenPayloadFilter(tokenizer);
    TokenStream stream = new WordDelimiterGraphFilter(firstTokenPayloadFilter,
            WordDelimiterGraphFilter.GENERATE_WORD_PARTS | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                    | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                    | WordDelimiterGraphFilter.CATENATE_ALL | WordDelimiterGraphFilter.CATENATE_NUMBERS
                    | WordDelimiterGraphFilter.CATENATE_WORDS | WordDelimiterGraphFilter.PRESERVE_ORIGINAL,
            CharArraySet.EMPTY_SET);
    stream = SmartAnalyzerSet.ascii(stream);
    // Set the payload to any token
    stream = firstTokenPayloadFilter.newSetter(stream);
    return new TokenStreamComponents(tokenizer, stream) {
        @Override
        protected void setReader(final Reader reader) {
            super.setReader(reader);
        }
    };
}
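
In the WordDelimiterGraphFilter constructor used above, the trailing CharArraySet is the set of protected words exempt from splitting, so EMPTY_SET leaves every token subject to the delimiter rules. A hypothetical alternative sketch (the protected terms are invented, and flags stands in for the flag mask built above):

// Protect selected tokens from word-delimiter splitting instead of passing EMPTY_SET.
CharArraySet protWords = new CharArraySet(Arrays.asList("wi-fi", "e-mail"), true);
TokenStream guarded = new WordDelimiterGraphFilter(firstTokenPayloadFilter, flags, protWords);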

From source file:edu.mit.ll.vizlincdb.document.FoldingSpanishAnalyzer.java

License:Apache License

/**
 * Builds an analyzer with the given stop words.
 *
 * @param matchVersion lucene compatibility version
 * @param stopwords a stopword set
 */
public FoldingSpanishAnalyzer(Version matchVersion, Set<?> stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}

From source file:io.anserini.index.IndexCollection.java

License:Apache License

public void run() throws IOException, InterruptedException {
    final long start = System.nanoTime();
    LOG.info("Starting indexer...");

    int numThreads = args.threads;

    final Directory dir = FSDirectory.open(indexPath);
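    // args.keepStopwords: hand EnglishAnalyzer an empty stop set so nothing is filtered out; otherwise use its default stop list.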
    final EnglishAnalyzer analyzer = args.keepStopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET)
            : new EnglishAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setSimilarity(new BM25Similarity());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setRAMBufferSizeMB(args.memorybufferSize);
    config.setUseCompoundFile(false);
    config.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, config);

    final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);
    final List<Path> segmentPaths = collection.getFileSegmentPaths();

    final int segmentCnt = segmentPaths.size();
    LOG.info(segmentCnt + " files found in " + collectionPath.toString());
    for (int i = 0; i < segmentCnt; i++) {
        executor.execute(new IndexerThread(writer, collection, segmentPaths.get(i)));
    }

    executor.shutdown();

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
            LOG.info(String.format("%.2f percent completed",
                    (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d));
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    if (segmentCnt != executor.getCompletedTaskCount()) {
        throw new RuntimeException("totalFiles = " + segmentCnt + " is not equal to completedTaskCount =  "
                + executor.getCompletedTaskCount());
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (args.optimize)
            writer.forceMerge(1);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            // It is possible that this happens... but nothing much we can do at this point,
            // so just log the error and move on.
            LOG.error(e);
        }
    }

    LOG.info("Indexed documents: " + counters.indexedDocuments.get());
    LOG.info("Empty documents: " + counters.emptyDocuments.get());
    LOG.info("Errors: " + counters.errors.get());

    final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    LOG.info("Total " + numIndexed + " documents indexed in "
            + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
}

From source file:io.anserini.index.IndexUtils.java

License:Apache License

public void printTermCounts(String termStr) throws IOException, ParseException {
    EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea);
    TermQuery q = (TermQuery) qp.parse(termStr);
    Term t = q.getTerm();

    System.out.println("raw term:             " + termStr);
    System.out.println("stemmed term:         " + q.toString(LuceneDocumentGenerator.FIELD_BODY));
    System.out.println("collection frequency: " + reader.totalTermFreq(t));
    System.out.println("document frequency:   " + reader.docFreq(t));

    PostingsEnum postingsEnum = MultiFields.getTermDocsEnum(reader, LuceneDocumentGenerator.FIELD_BODY,
            t.bytes());
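    // MultiFields.getTermDocsEnum returns null when the term is absent, so this assumes the term occurs in the field.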
    System.out.println("postings:\n");
    while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        System.out.printf("\t%s, %s\n", postingsEnum.docID(), postingsEnum.freq());
    }
}

From source file:io.anserini.qa.passage.IdfPassageScorer.java

License:Apache License

@Override
public void score(String query, Map<String, Float> sentences) throws Exception {
    //    EnglishAnalyzer ea = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords));
    EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea);
    ClassicSimilarity similarity = new ClassicSimilarity();

    String escapedQuery = qp.escape(query);
    Query question = qp.parse(escapedQuery);
    HashSet<String> questionTerms = new HashSet<>(
            Arrays.asList(question.toString().trim().toLowerCase().split("\\s+")));

    // add the question terms to the termIDF Map
    for (String questionTerm : questionTerms) {
        try {
            TermQuery q = (TermQuery) qp.parse(questionTerm);
            Term t = q.getTerm();

            double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
            termIdfMap.put(questionTerm, String.valueOf(termIDF));
        } catch (Exception e) {
            continue;
        }
    }

    // avoid duplicate passages
    HashSet<String> seenSentences = new HashSet<>();

    for (Map.Entry<String, Float> sent : sentences.entrySet()) {
        double idf = 0.0;
        HashSet<String> seenTerms = new HashSet<>();

        String[] terms = sent.getKey().toLowerCase().split("\\s+");
        for (String term : terms) {
            try {
                TermQuery q = (TermQuery) qp.parse(term);
                Term t = q.getTerm();
                double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
                termIdfMap.put(term, String.valueOf(termIDF));

                if (questionTerms.contains(t.toString()) && !seenTerms.contains(t.toString())) {
                    idf += termIDF;
                    seenTerms.add(t.toString());
                } else {
                    idf += 0.0;
                }
            } catch (Exception e) {
                continue;
            }
        }

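        // Combine the accumulated question-term IDF with the sentence's incoming score, which is heavily down-weighted.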
        double weightedScore = idf + 0.0001 * sent.getValue();
        ScoredPassage scoredPassage = new ScoredPassage(sent.getKey(), weightedScore, sent.getValue());
        // Deduplicate on the sentence text itself; the set stores keys, not Map.Entry objects.
        if ((scoredPassageHeap.size() < topPassages || weightedScore > scoredPassageHeap.peekLast().getScore())
                && !seenSentences.contains(sent.getKey())) {
            if (scoredPassageHeap.size() == topPassages) {
                scoredPassageHeap.pollLast();
            }
            scoredPassageHeap.add(scoredPassage);
            seenSentences.add(sent.getKey());
        }
    }
}