List of usage examples for org.apache.lucene.analysis.CharArraySet.EMPTY_SET
CharArraySet.EMPTY_SET
To view the full source code for a usage of org.apache.lucene.analysis.CharArraySet.EMPTY_SET,
click the corresponding source link below.
From source file:io.anserini.qa.passage.IdfPassageScorer.java
License:Apache License
@Override public JSONObject getTermIdfJSON(List<String> sentList) { // EnglishAnalyzer ea = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords)); EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET); QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea); ClassicSimilarity similarity = new ClassicSimilarity(); for (String sent : sentList) { String[] thisSentence = sent.trim().split("\\s+"); for (String term : thisSentence) { try { TermQuery q = (TermQuery) qp.parse(term); Term t = q.getTerm();//from w w w. j a v a 2 s.c om double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs()); termIdfMap.put(term, String.valueOf(termIDF)); } catch (Exception e) { continue; } } } return new JSONObject(termIdfMap); }
From source file:io.anserini.search.SearchCollection.java
License:Apache License
/**
 * Opens the index named in {@code args} and configures the search pipeline:
 * the scoring model (QL, BM25, or F2Log), the analyzer (tweet analyzer,
 * stop-word-keeping English, or default English), and the reranker cascade
 * (RM3 or axiomatic reranking, always followed by score-tie adjustment).
 *
 * @param args parsed command-line search arguments
 * @throws IOException if the index cannot be opened
 * @throws IllegalArgumentException if the index path is not a readable
 *         directory or no scoring model was selected
 */
public SearchCollection(SearchArgs args) throws IOException {
    this.args = args;

    Path indexPath = Paths.get(args.index);
    if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
        throw new IllegalArgumentException(args.index + " does not exist or is not a directory.");
    }

    LOG.info("Reading index at " + args.index);
    this.reader = DirectoryReader.open(FSDirectory.open(indexPath));

    // Figure out which scoring model to use.
    if (args.ql) {
        LOG.info("Using QL scoring model");
        this.similarity = new LMDirichletSimilarity(args.mu);
    } else if (args.bm25) {
        LOG.info("Using BM25 scoring model");
        this.similarity = new BM25Similarity(args.k1, args.b);
    } else if (args.f2log) {
        LOG.info("Using F2Log scoring model");
        this.similarity = new F2LogSimilarity(args.f2log_s);
    } else {
        throw new IllegalArgumentException("Error: Must specify scoring model!");
    }

    // Pick the analyzer: tweets get their own tokenizer; otherwise keepstop
    // decides whether English stop words are retained (empty stop set) or
    // filtered with the analyzer's defaults.
    if (args.searchtweets) {
        analyzer = new TweetAnalyzer();
    } else if (args.keepstop) {
        analyzer = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    } else {
        analyzer = new EnglishAnalyzer();
    }

    isRerank = args.rm3 || args.axiom;

    // Set up the ranking cascade; the tie adjuster always runs last.
    cascade = new RerankerCascade();
    if (args.rm3) {
        cascade.add(new Rm3Reranker(analyzer, FIELD_BODY, args));
    } else if (args.axiom) {
        cascade.add(new AxiomReranker(FIELD_BODY, args));
    }
    cascade.add(new ScoreTiesAdjusterReranker());
}
From source file:org.apache.carbondata.datamap.lucene.LuceneDataMapWriter.java
License:Apache License
/** * Start of new blocklet notification./*from ww w .ja va 2s . c o m*/ */ public void onBlockletStart(int blockletId) throws IOException { if (null == analyzer) { if (CarbonProperties.getInstance() .getProperty(CarbonCommonConstants.CARBON_LUCENE_INDEX_STOP_WORDS, CarbonCommonConstants.CARBON_LUCENE_INDEX_STOP_WORDS_DEFAULT) .equalsIgnoreCase("true")) { analyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET); } else { analyzer = new StandardAnalyzer(); } } // save index data into ram, write into disk after one page finished ramDir = new RAMDirectory(); ramIndexWriter = new IndexWriter(ramDir, new IndexWriterConfig(analyzer)); if (indexWriter != null) { return; } // get index path, put index data into segment's path String dataMapPath; if (storeBlockletWise) { dataMapPath = this.dataMapPath + File.separator + blockletId; } else { dataMapPath = this.dataMapPath; } Path indexPath = FileFactory.getPath(dataMapPath); FileSystem fs = FileFactory.getFileSystem(indexPath); // if index path not exists, create it if (!fs.exists(indexPath)) { if (!fs.mkdirs(indexPath)) { throw new IOException("Failed to create directory " + dataMapPath); } } // the indexWriter closes the FileSystem on closing the writer, so for a new configuration // and disable the cache for the index writer, it will be closed on closing the writer Configuration conf = FileFactory.getConfiguration(); conf.set("fs.hdfs.impl.disable.cache", "true"); // create a index writer Directory indexDir = new HdfsDirectory(indexPath, conf); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer); if (CarbonProperties.getInstance() .getProperty(CarbonCommonConstants.CARBON_LUCENE_COMPRESSION_MODE, CarbonCommonConstants.CARBON_LUCENE_COMPRESSION_MODE_DEFAULT) .equalsIgnoreCase(CarbonCommonConstants.CARBON_LUCENE_COMPRESSION_MODE_DEFAULT)) { indexWriterConfig.setCodec(speedCodec); } else { indexWriterConfig.setCodec(compressionCodec); } indexWriter = new IndexWriter(indexDir, indexWriterConfig); }
From source file:org.elasticsearch.analysis.common.ArabicAnalyzerProvider.java
License:Apache License
ArabicAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); arabicAnalyzer = new ArabicAnalyzer( Analysis.parseStopWords(env, settings, ArabicAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); arabicAnalyzer.setVersion(version);/* w ww .j a v a2 s.co m*/ }
From source file:org.elasticsearch.analysis.common.ArmenianAnalyzerProvider.java
License:Apache License
ArmenianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); analyzer = new ArmenianAnalyzer( Analysis.parseStopWords(env, settings, ArmenianAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version);//from w w w. ja v a 2 s .co m }
From source file:org.elasticsearch.analysis.common.BasqueAnalyzerProvider.java
License:Apache License
BasqueAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); analyzer = new BasqueAnalyzer(Analysis.parseStopWords(env, settings, BasqueAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version);//from www. jav a 2 s .c om }
From source file:org.elasticsearch.analysis.common.BengaliAnalyzerProvider.java
License:Apache License
BengaliAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); analyzer = new BengaliAnalyzer(Analysis.parseStopWords(env, settings, BengaliAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version);/*from w ww. j a va2 s . c o m*/ }
From source file:org.elasticsearch.analysis.common.BrazilianAnalyzerProvider.java
License:Apache License
BrazilianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); analyzer = new BrazilianAnalyzer( Analysis.parseStopWords(env, settings, BrazilianAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version);//from www . j a v a2 s. c om }
From source file:org.elasticsearch.analysis.common.BrazilianStemTokenFilterFactory.java
License:Apache License
/**
 * Creates the Brazilian-stem token filter factory. The stem-exclusion word
 * set is read from the settings and defaults to empty when not configured.
 */
BrazilianStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name,
        Settings settings) {
    super(indexSettings, name, settings);
    this.exclusions = Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET);
}
From source file:org.elasticsearch.analysis.common.BulgarianAnalyzerProvider.java
License:Apache License
BulgarianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); analyzer = new BulgarianAnalyzer( Analysis.parseStopWords(env, settings, BulgarianAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version);/*from w ww.j a v a 2s. c o m*/ }