Example usage for org.apache.lucene.analysis CharArraySet EMPTY_SET

List of usage examples for org.apache.lucene.analysis CharArraySet EMPTY_SET

Introduction

On this page you can find example usages of org.apache.lucene.analysis CharArraySet EMPTY_SET.

Prototype

CharArraySet EMPTY_SET

To view the source code for org.apache.lucene.analysis CharArraySet EMPTY_SET, click the Source Link.

Document

An empty CharArraySet.

Usage

From source file:io.anserini.qa.passage.IdfPassageScorer.java

License:Apache License

/**
 * Computes an IDF value for each whitespace-separated token of the given sentences and
 * returns the term-to-IDF map as JSON.
 *
 * <p>Each token is parsed against the body field, its document frequency is read from
 * {@code reader}, and the resulting IDF is stored (as a string) in {@code termIdfMap}.
 * The map is not cleared here, so entries added by earlier calls are still present.
 * Any exception raised while handling a token causes just that token to be skipped.
 *
 * @param sentList sentences whose tokens should be scored
 * @return a {@code JSONObject} built from {@code termIdfMap}
 */
@Override
public JSONObject getTermIdfJSON(List<String> sentList) {
    // The analyzer is given an empty stop-word set (the stopWords-based variant was disabled).
    final EnglishAnalyzer analyzer = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    final QueryParser parser = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, analyzer);
    final ClassicSimilarity tfIdf = new ClassicSimilarity();

    for (String sentence : sentList) {
        for (String token : sentence.trim().split("\\s+")) {
            try {
                Term parsedTerm = ((TermQuery) parser.parse(token)).getTerm();
                double idf = tfIdf.idf(reader.docFreq(parsedTerm), reader.numDocs());
                termIdfMap.put(token, String.valueOf(idf));
            } catch (Exception ignored) {
                // Best effort: a token that fails to parse into a single TermQuery, or whose
                // statistics cannot be read, simply gets no entry in the map.
            }
        }
    }
    return new JSONObject(termIdfMap);
}

From source file:io.anserini.search.SearchCollection.java

License:Apache License

/**
 * Creates a searcher over the index at {@code args.index}: opens the index reader, selects
 * the scoring model and analyzer from the flags in {@code args}, and builds the reranker
 * cascade.
 *
 * <p>All argument validation happens before the index is opened, so a rejected argument
 * never leaves an open reader behind.
 *
 * @param args search options (index location, scoring model flags and parameters,
 *             analyzer flags, reranking flags)
 * @throws IOException if the index cannot be opened
 * @throws IllegalArgumentException if the index path does not exist, is not a directory or
 *         is not readable, or if none of the scoring model flags is set
 */
public SearchCollection(SearchArgs args) throws IOException {
    this.args = args;
    Path indexPath = Paths.get(args.index);

    if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
        throw new IllegalArgumentException(args.index + " does not exist or is not a directory.");
    }

    // Reject a missing scoring model *before* opening the index. If this check only ran
    // after DirectoryReader.open(...), the constructor would throw with the reader already
    // open and nobody would hold a reference to close it.
    if (!args.ql && !args.bm25 && !args.f2log) {
        throw new IllegalArgumentException("Error: Must specify scoring model!");
    }

    LOG.info("Reading index at " + args.index);
    this.reader = DirectoryReader.open(FSDirectory.open(indexPath));

    // Figure out which scoring model to use.
    if (args.ql) {
        LOG.info("Using QL scoring model");
        this.similarity = new LMDirichletSimilarity(args.mu);
    } else if (args.bm25) {
        LOG.info("Using BM25 scoring model");
        this.similarity = new BM25Similarity(args.k1, args.b);
    } else if (args.f2log) {
        LOG.info("Using F2Log scoring model");
        this.similarity = new F2LogSimilarity(args.f2log_s);
    } else {
        // Not reachable after the check above; kept as a defensive fallback.
        throw new IllegalArgumentException("Error: Must specify scoring model!");
    }

    // Are we searching tweets?
    if (args.searchtweets) {
        analyzer = new TweetAnalyzer();
    } else {
        // With keepstop set, the analyzer is given an empty stop-word set.
        analyzer = args.keepstop ? new EnglishAnalyzer(CharArraySet.EMPTY_SET) : new EnglishAnalyzer();
    }

    isRerank = args.rm3 || args.axiom;

    // Set up the ranking cascade; RM3 takes precedence over axiomatic reranking.
    cascade = new RerankerCascade();
    if (args.rm3) {
        cascade.add(new Rm3Reranker(analyzer, FIELD_BODY, args));
    } else if (args.axiom) {
        cascade.add(new AxiomReranker(FIELD_BODY, args));
    }

    cascade.add(new ScoreTiesAdjusterReranker());
}

From source file:org.apache.carbondata.datamap.lucene.LuceneDataMapWriter.java

License:Apache License

/**
 * Start of new blocklet notification./*from  ww w  .ja va 2s .  c o  m*/
 */
public void onBlockletStart(int blockletId) throws IOException {
    if (null == analyzer) {
        if (CarbonProperties.getInstance()
                .getProperty(CarbonCommonConstants.CARBON_LUCENE_INDEX_STOP_WORDS,
                        CarbonCommonConstants.CARBON_LUCENE_INDEX_STOP_WORDS_DEFAULT)
                .equalsIgnoreCase("true")) {
            analyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET);
        } else {
            analyzer = new StandardAnalyzer();
        }
    }
    // save index data into ram, write into disk after one page finished
    ramDir = new RAMDirectory();
    ramIndexWriter = new IndexWriter(ramDir, new IndexWriterConfig(analyzer));

    if (indexWriter != null) {
        return;
    }
    // get index path, put index data into segment's path
    String dataMapPath;
    if (storeBlockletWise) {
        dataMapPath = this.dataMapPath + File.separator + blockletId;
    } else {
        dataMapPath = this.dataMapPath;
    }
    Path indexPath = FileFactory.getPath(dataMapPath);
    FileSystem fs = FileFactory.getFileSystem(indexPath);

    // if index path not exists, create it
    if (!fs.exists(indexPath)) {
        if (!fs.mkdirs(indexPath)) {
            throw new IOException("Failed to create directory " + dataMapPath);
        }
    }

    // the indexWriter closes the FileSystem on closing the writer, so for a new configuration
    // and disable the cache for the index writer, it will be closed on closing the writer
    Configuration conf = FileFactory.getConfiguration();
    conf.set("fs.hdfs.impl.disable.cache", "true");

    // create a index writer
    Directory indexDir = new HdfsDirectory(indexPath, conf);

    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
    if (CarbonProperties.getInstance()
            .getProperty(CarbonCommonConstants.CARBON_LUCENE_COMPRESSION_MODE,
                    CarbonCommonConstants.CARBON_LUCENE_COMPRESSION_MODE_DEFAULT)
            .equalsIgnoreCase(CarbonCommonConstants.CARBON_LUCENE_COMPRESSION_MODE_DEFAULT)) {
        indexWriterConfig.setCodec(speedCodec);
    } else {
        indexWriterConfig.setCodec(compressionCodec);
    }

    indexWriter = new IndexWriter(indexDir, indexWriterConfig);
}

From source file:org.elasticsearch.analysis.common.ArabicAnalyzerProvider.java

License:Apache License

ArabicAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    arabicAnalyzer = new ArabicAnalyzer(
            Analysis.parseStopWords(env, settings, ArabicAnalyzer.getDefaultStopSet()),
            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    arabicAnalyzer.setVersion(version);/* w ww  .j a v a2  s.co  m*/
}

From source file:org.elasticsearch.analysis.common.ArmenianAnalyzerProvider.java

License:Apache License

ArmenianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new ArmenianAnalyzer(
            Analysis.parseStopWords(env, settings, ArmenianAnalyzer.getDefaultStopSet()),
            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);//from w  w  w. ja v  a 2 s .co m
}

From source file:org.elasticsearch.analysis.common.BasqueAnalyzerProvider.java

License:Apache License

BasqueAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new BasqueAnalyzer(Analysis.parseStopWords(env, settings, BasqueAnalyzer.getDefaultStopSet()),
            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);//from   www. jav a 2 s .c  om
}

From source file:org.elasticsearch.analysis.common.BengaliAnalyzerProvider.java

License:Apache License

BengaliAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new BengaliAnalyzer(Analysis.parseStopWords(env, settings, BengaliAnalyzer.getDefaultStopSet()),
            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);/*from w ww.  j  a  va2 s . c  o m*/
}

From source file:org.elasticsearch.analysis.common.BrazilianAnalyzerProvider.java

License:Apache License

BrazilianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new BrazilianAnalyzer(
            Analysis.parseStopWords(env, settings, BrazilianAnalyzer.getDefaultStopSet()),
            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);//from www .  j  a v  a2  s. c  om
}

From source file:org.elasticsearch.analysis.common.BrazilianStemTokenFilterFactory.java

License:Apache License

BrazilianStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name,
        Settings settings) {/*from   w  w  w. j  a va  2  s .  co  m*/
    super(indexSettings, name, settings);
    this.exclusions = Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET);
}

From source file:org.elasticsearch.analysis.common.BulgarianAnalyzerProvider.java

License:Apache License

BulgarianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new BulgarianAnalyzer(
            Analysis.parseStopWords(env, settings, BulgarianAnalyzer.getDefaultStopSet()),
            Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);/*from   w  ww.j a v a  2s.  c o m*/
}