List of usage examples for org.apache.lucene.analysis.CharArraySet.EMPTY_SET
CharArraySet EMPTY_SET
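CharArraySet.EMPTY_SET is Lucene's shared, immutable empty stop-word set. Passing it to an analyzer constructor that accepts a stop-word set effectively turns stop-word filtering off, which is the common thread in the examples below. A minimal, self-contained sketch of that pattern (the class name and sample text are illustrative; it assumes lucene-core and lucene-analyzers-common are on the classpath):

import java.io.IOException;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class EmptyStopSetDemo {
    public static void main(String[] args) throws IOException {
        // Passing CharArraySet.EMPTY_SET disables stop-word removal,
        // so tokens like "the" survive analysis (stemming still applies).
        try (EnglishAnalyzer analyzer = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
                TokenStream stream = analyzer.tokenStream("body", "the quick brown foxes")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            stream.end();
        }
    }
}

With the default EnglishAnalyzer constructor, "the" would be dropped as a stop word; with EMPTY_SET it is kept.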
From source file:RomanianAnalyzer.java
License:Apache License
/**
 * Builds an analyzer with the given stop words.
 *
 * @param matchVersion lucene compatibility version
 * @param stopwords a stopword set
 */
public RomanianAnalyzer(Version matchVersion, Set<?> stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}
From source file:ai.castor.idf.FetchTermIDF.java
License:Apache License
public double getTermIDF(String term) throws ParseException {
    Analyzer analyzer = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser qp = new QueryParser(FIELD_BODY, analyzer);
    ClassicSimilarity similarity = new ClassicSimilarity();

    String esTerm = qp.escape(term);
    double termIDF = 0.0;

    try {
        TermQuery q = (TermQuery) qp.parse(esTerm);
        Term t = q.getTerm();
        termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
        System.out.println(term + '\t' + esTerm + '\t' + q + '\t' + t + '\t' + termIDF);
    } catch (Exception e) {
        System.err.println("Exception in fetching IDF(" + term + "): " + e.toString());
    }

    return termIDF;
}
From source file:com.mozilla.grouperfish.lucene.analysis.en.EnglishAnalyzer.java
License:Apache License
/**
 * Builds an analyzer with the given stop words.
 *
 * @param matchVersion lucene compatibility version
 * @param stopwords a stopword set
 */
public EnglishAnalyzer(Version matchVersion, Set<?> stopwords, boolean stem) {
    this(matchVersion, stopwords, stem, CharArraySet.EMPTY_SET);
}
From source file:com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer.java
License:Apache License
public NGramEnglishAnalyzer(Version version, Set<?> stopwords, boolean stem, boolean outputUnigrams) {
    this(version, stopwords, stem, outputUnigrams, ShingleAllStopFilter.DEFAULT_MIN_SHINGLE_SIZE,
            ShingleAllStopFilter.DEFAULT_MAX_SHINGLE_SIZE, CharArraySet.EMPTY_SET);
}
From source file:com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer.java
License:Apache License
public NGramEnglishAnalyzer(Version version, Set<?> stopwords, boolean stem, boolean outputUnigrams,
        int minNGram, int maxNGram) {
    this(version, stopwords, stem, outputUnigrams, minNGram, maxNGram, CharArraySet.EMPTY_SET);
}
From source file:com.qwazr.search.bench.test.MultiField.PayloadAnalyzer.java
License:Apache License
@Override
protected final TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer tokenizer = new UAX29URLEmailTokenizer();
    // Read the payload from the first token
    final FirstTokenPayloadFilter firstTokenPayloadFilter = new FirstTokenPayloadFilter(tokenizer);
    TokenStream stream = new WordDelimiterGraphFilter(firstTokenPayloadFilter,
            WordDelimiterGraphFilter.GENERATE_WORD_PARTS | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                    | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                    | WordDelimiterGraphFilter.CATENATE_ALL | WordDelimiterGraphFilter.CATENATE_NUMBERS
                    | WordDelimiterGraphFilter.CATENATE_WORDS | WordDelimiterGraphFilter.PRESERVE_ORIGINAL,
            CharArraySet.EMPTY_SET);
    stream = SmartAnalyzerSet.ascii(stream);
    // Set the payload to any token
    stream = firstTokenPayloadFilter.newSetter(stream);
    return new TokenStreamComponents(tokenizer, stream) {
        @Override
        protected void setReader(final Reader reader) {
            super.setReader(reader);
        }
    };
}
From source file:edu.mit.ll.vizlincdb.document.FoldingSpanishAnalyzer.java
License:Apache License
/**
 * Builds an analyzer with the given stop words.
 *
 * @param matchVersion lucene compatibility version
 * @param stopwords a stopword set
 */
public FoldingSpanishAnalyzer(Version matchVersion, Set<?> stopwords) {
    this(matchVersion, stopwords, CharArraySet.EMPTY_SET);
}
From source file:io.anserini.index.IndexCollection.java
License:Apache License
public void run() throws IOException, InterruptedException {
    final long start = System.nanoTime();
    LOG.info("Starting indexer...");

    int numThreads = args.threads;
    final Directory dir = FSDirectory.open(indexPath);
    final EnglishAnalyzer analyzer = args.keepStopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET)
            : new EnglishAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setSimilarity(new BM25Similarity());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setRAMBufferSizeMB(args.memorybufferSize);
    config.setUseCompoundFile(false);
    config.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, config);
    final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);
    final List<Path> segmentPaths = collection.getFileSegmentPaths();
    final int segmentCnt = segmentPaths.size();

    LOG.info(segmentCnt + " files found in " + collectionPath.toString());
    for (int i = 0; i < segmentCnt; i++) {
        executor.execute(new IndexerThread(writer, collection, segmentPaths.get(i)));
    }

    executor.shutdown();
    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
            LOG.info(String.format("%.2f percent completed",
                    (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d));
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    if (segmentCnt != executor.getCompletedTaskCount()) {
        throw new RuntimeException("totalFiles = " + segmentCnt + " is not equal to completedTaskCount = "
                + executor.getCompletedTaskCount());
    }

    int numIndexed = writer.maxDoc();
    try {
        writer.commit();
        if (args.optimize)
            writer.forceMerge(1);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            // It is possible that this happens... but nothing much we can do at this point,
            // so just log the error and move on.
            LOG.error(e);
        }
    }

    LOG.info("Indexed documents: " + counters.indexedDocuments.get());
    LOG.info("Empty documents: " + counters.emptyDocuments.get());
    LOG.info("Errors: " + counters.errors.get());

    final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    LOG.info("Total " + numIndexed + " documents indexed in "
            + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
}
From source file:io.anserini.index.IndexUtils.java
License:Apache License
public void printTermCounts(String termStr) throws IOException, ParseException {
    EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea);
    TermQuery q = (TermQuery) qp.parse(termStr);
    Term t = q.getTerm();

    System.out.println("raw term: " + termStr);
    System.out.println("stemmed term: " + q.toString(LuceneDocumentGenerator.FIELD_BODY));
    System.out.println("collection frequency: " + reader.totalTermFreq(t));
    System.out.println("document frequency: " + reader.docFreq(t));

    PostingsEnum postingsEnum = MultiFields.getTermDocsEnum(reader, LuceneDocumentGenerator.FIELD_BODY, t.bytes());
    System.out.println("postings:\n");
    while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        System.out.printf("\t%s, %s\n", postingsEnum.docID(), postingsEnum.freq());
    }
}
From source file:io.anserini.qa.passage.IdfPassageScorer.java
License:Apache License
@Override
public void score(String query, Map<String, Float> sentences) throws Exception {
    // EnglishAnalyzer ea = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords));
    EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea);
    ClassicSimilarity similarity = new ClassicSimilarity();

    String escapedQuery = qp.escape(query);
    Query question = qp.parse(escapedQuery);
    HashSet<String> questionTerms = new HashSet<>(
            Arrays.asList(question.toString().trim().toLowerCase().split("\\s+")));

    // add the question terms to the termIDF Map
    for (String questionTerm : questionTerms) {
        try {
            TermQuery q = (TermQuery) qp.parse(questionTerm);
            Term t = q.getTerm();
            double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
            termIdfMap.put(questionTerm, String.valueOf(termIDF));
        } catch (Exception e) {
            continue;
        }
    }

    // avoid duplicate passages
    HashSet<String> seenSentences = new HashSet<>();

    for (Map.Entry<String, Float> sent : sentences.entrySet()) {
        double idf = 0.0;
        HashSet<String> seenTerms = new HashSet<>();
        String[] terms = sent.getKey().toLowerCase().split("\\s+");

        for (String term : terms) {
            try {
                TermQuery q = (TermQuery) qp.parse(term);
                Term t = q.getTerm();
                double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
                termIdfMap.put(term, String.valueOf(termIDF));
                // only count IDF once per distinct question term appearing in the sentence
                if (questionTerms.contains(t.toString()) && !seenTerms.contains(t.toString())) {
                    idf += termIDF;
                    seenTerms.add(t.toString());
                }
            } catch (Exception e) {
                continue;
            }
        }

        double weightedScore = idf + 0.0001 * sent.getValue();
        ScoredPassage scoredPassage = new ScoredPassage(sent.getKey(), weightedScore, sent.getValue());
        // check the sentence text (not the Map.Entry) against the seen set
        if ((scoredPassageHeap.size() < topPassages || weightedScore > scoredPassageHeap.peekLast().getScore())
                && !seenSentences.contains(sent.getKey())) {
            if (scoredPassageHeap.size() == topPassages) {
                scoredPassageHeap.pollLast();
            }
            scoredPassageHeap.add(scoredPassage);
            seenSentences.add(sent.getKey());
        }
    }
}