Example usage for org.apache.lucene.analysis.en EnglishAnalyzer EnglishAnalyzer

List of usage examples for org.apache.lucene.analysis.en EnglishAnalyzer EnglishAnalyzer

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.en EnglishAnalyzer EnglishAnalyzer.

Prototype

public EnglishAnalyzer() 

Source Link

Document

Builds an analyzer with the default stop words: #getDefaultStopSet .

Usage

From source file:apps.LuceneIndexer.java

License:Apache License

public static void main(String[] args) {
    Options options = new Options();

    options.addOption("i", null, true, "input file");
    options.addOption("o", null, true, "output directory");
    options.addOption("r", null, true, "optional output TREC-format QREL file");

    options.addOption("bm25_b", null, true, "BM25 parameter: b");
    options.addOption("bm25_k1", null, true, "BM25 parameter: k1");
    options.addOption("bm25fixed", null, false, "use the fixed BM25 similarity");

    Joiner commaJoin = Joiner.on(',');
    Joiner spaceJoin = Joiner.on(' ');

    options.addOption("source_type", null, true,
            "document source type: " + commaJoin.join(SourceFactory.getDocSourceList()));

    // If you increase this value, you may need to modify the following line in *.sh file
    // export MAVEN_OPTS="-Xms8192m -server"
    double ramBufferSizeMB = 1024 * 8; // 8 GB

    CommandLineParser parser = new org.apache.commons.cli.GnuParser();

    IndexWriter indexWriter = null;//from   w w w . j  a va2  s. com
    BufferedWriter qrelWriter = null;

    int docNum = 0;

    try {
        CommandLine cmd = parser.parse(options, args);

        String inputFileName = null, outputDirName = null, qrelFileName = null;

        if (cmd.hasOption("i")) {
            inputFileName = cmd.getOptionValue("i");
        } else {
            Usage("Specify 'input file'", options);
        }

        if (cmd.hasOption("o")) {
            outputDirName = cmd.getOptionValue("o");
        } else {
            Usage("Specify 'index directory'", options);
        }

        if (cmd.hasOption("r")) {
            qrelFileName = cmd.getOptionValue("r");
        }

        String sourceName = cmd.getOptionValue("source_type");

        if (sourceName == null)
            Usage("Specify document source type", options);

        if (qrelFileName != null)
            qrelWriter = new BufferedWriter(new FileWriter(qrelFileName));

        File outputDir = new File(outputDirName);
        if (!outputDir.exists()) {
            if (!outputDir.mkdirs()) {
                System.out.println("couldn't create " + outputDir.getAbsolutePath());
                System.exit(1);
            }
        }
        if (!outputDir.isDirectory()) {
            System.out.println(outputDir.getAbsolutePath() + " is not a directory!");
            System.exit(1);
        }
        if (!outputDir.canWrite()) {
            System.out.println("Can't write to " + outputDir.getAbsolutePath());
            System.exit(1);
        }

        boolean useFixedBM25 = cmd.hasOption("bm25fixed");

        float bm25_k1 = UtilConst.BM25_K1_DEFAULT, bm25_b = UtilConst.BM25_B_DEFAULT;

        if (cmd.hasOption("bm25_k1")) {
            try {
                bm25_k1 = Float.parseFloat(cmd.getOptionValue("bm25_k1"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_k1'", options);
            }
        }

        if (cmd.hasOption("bm25_b")) {
            try {
                bm25_b = Float.parseFloat(cmd.getOptionValue("bm25_b"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_b'", options);
            }
        }

        EnglishAnalyzer analyzer = new EnglishAnalyzer();
        FSDirectory indexDir = FSDirectory.open(Paths.get(outputDirName));
        IndexWriterConfig indexConf = new IndexWriterConfig(analyzer);

        /*
            OpenMode.CREATE creates a new index or overwrites an existing one.
            https://lucene.apache.org/core/6_0_0/core/org/apache/lucene/index/IndexWriterConfig.OpenMode.html#CREATE
        */
        indexConf.setOpenMode(OpenMode.CREATE);
        indexConf.setRAMBufferSizeMB(ramBufferSizeMB);

        System.out.println(String.format("BM25 parameters k1=%f b=%f ", bm25_k1, bm25_b));

        if (useFixedBM25) {
            System.out.println(String.format("Using fixed BM25Simlarity, k1=%f b=%f", bm25_k1, bm25_b));
            indexConf.setSimilarity(new BM25SimilarityFix(bm25_k1, bm25_b));
        } else {
            System.out.println(String.format("Using Lucene BM25Similarity, k1=%f b=%f", bm25_k1, bm25_b));
            indexConf.setSimilarity(new BM25Similarity(bm25_k1, bm25_b));
        }

        indexWriter = new IndexWriter(indexDir, indexConf);

        DocumentSource inpDocSource = SourceFactory.createDocumentSource(sourceName, inputFileName);
        DocumentEntry inpDoc = null;
        TextCleaner textCleaner = new TextCleaner(null);

        while ((inpDoc = inpDocSource.next()) != null) {
            ++docNum;

            Document luceneDoc = new Document();
            ArrayList<String> cleanedToks = textCleaner.cleanUp(inpDoc.mDocText);
            String cleanText = spaceJoin.join(cleanedToks);

            //        System.out.println(inpDoc.mDocId);
            //        System.out.println(cleanText);
            //        System.out.println("==============================");

            luceneDoc.add(new StringField(UtilConst.FIELD_ID, inpDoc.mDocId, Field.Store.YES));
            luceneDoc.add(new TextField(UtilConst.FIELD_TEXT, cleanText, Field.Store.YES));
            indexWriter.addDocument(luceneDoc);

            if (inpDoc.mIsRel != null && qrelWriter != null) {
                saveQrelOneEntry(qrelWriter, inpDoc.mQueryId, inpDoc.mDocId, inpDoc.mIsRel ? MAX_GRADE : 0);
            }
            if (docNum % 1000 == 0)
                System.out.println(String.format("Indexed %d documents", docNum));

        }

    } catch (ParseException e) {
        e.printStackTrace();
        Usage("Cannot parse arguments" + e, options);
    } catch (Exception e) {
        System.err.println("Terminating due to an exception: " + e);
        System.exit(1);
    } finally {
        System.out.println(String.format("Indexed %d documents", docNum));

        try {
            if (null != indexWriter)
                indexWriter.close();
            if (null != qrelWriter)
                qrelWriter.close();
        } catch (IOException e) {
            System.err.println("IO exception: " + e);
            e.printStackTrace();
        }
    }
}

From source file:apps.LuceneQuery.java

License:Apache License

public static void main(String[] args) {
    Options options = new Options();

    options.addOption("d", null, true, "index directory");
    options.addOption("i", null, true, "input file");
    options.addOption("s", null, true, "stop word file");
    options.addOption("n", null, true, "max # of results");
    options.addOption("o", null, true, "a TREC-style output file");
    options.addOption("r", null, true, "an optional QREL file, if specified,"
            + "we save results only for queries for which we find at least one relevant entry.");

    options.addOption("prob", null, true, "question sampling probability");
    options.addOption("max_query_qty", null, true, "a maximum number of queries to run");
    options.addOption("bm25_b", null, true, "BM25 parameter: b");
    options.addOption("bm25_k1", null, true, "BM25 parameter: k1");
    options.addOption("bm25fixed", null, false, "use the fixed BM25 similarity");

    options.addOption("seed", null, true, "random seed");

    Joiner commaJoin = Joiner.on(',');
    Joiner spaceJoin = Joiner.on(' ');

    options.addOption("source_type", null, true,
            "query source type: " + commaJoin.join(SourceFactory.getQuerySourceList()));

    CommandLineParser parser = new org.apache.commons.cli.GnuParser();

    QrelReader qrels = null;/*from   w  w w. j  av a 2 s.  c  o m*/

    try {

        CommandLine cmd = parser.parse(options, args);

        String indexDir = null;

        if (cmd.hasOption("d")) {
            indexDir = cmd.getOptionValue("d");
        } else {
            Usage("Specify 'index directory'", options);
        }

        String inputFileName = null;

        if (cmd.hasOption("i")) {
            inputFileName = cmd.getOptionValue("i");
        } else {
            Usage("Specify 'input file'", options);
        }

        DictNoComments stopWords = null;

        if (cmd.hasOption("s")) {
            String stopWordFileName = cmd.getOptionValue("s");
            stopWords = new DictNoComments(new File(stopWordFileName), true /* lowercasing */);
            System.out.println("Using the stopword file: " + stopWordFileName);
        }

        String sourceName = cmd.getOptionValue("source_type");

        if (sourceName == null)
            Usage("Specify document source type", options);

        int numRet = 100;

        if (cmd.hasOption("n")) {
            numRet = Integer.parseInt(cmd.getOptionValue("n"));
            System.out.println("Retrieving at most " + numRet + " candidate entries.");
        }

        String trecOutFileName = null;

        if (cmd.hasOption("o")) {
            trecOutFileName = cmd.getOptionValue("o");
        } else {
            Usage("Specify 'a TREC-style output file'", options);
        }

        double fProb = 1.0f;

        if (cmd.hasOption("prob")) {
            try {
                fProb = Double.parseDouble(cmd.getOptionValue("prob"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'question sampling probability'", options);
            }
        }

        if (fProb <= 0 || fProb > 1) {
            Usage("Question sampling probability should be >0 and <=1", options);
        }

        System.out.println("Sample the following fraction of questions: " + fProb);

        float bm25_k1 = UtilConst.BM25_K1_DEFAULT, bm25_b = UtilConst.BM25_B_DEFAULT;

        if (cmd.hasOption("bm25_k1")) {
            try {
                bm25_k1 = Float.parseFloat(cmd.getOptionValue("bm25_k1"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_k1'", options);
            }
        }

        if (cmd.hasOption("bm25_b")) {
            try {
                bm25_b = Float.parseFloat(cmd.getOptionValue("bm25_b"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_b'", options);
            }
        }

        long seed = 0;

        String tmpl = cmd.getOptionValue("seed");

        if (tmpl != null)
            seed = Long.parseLong(tmpl);

        System.out.println("Using seed: " + seed);

        Random randGen = new Random(seed);

        System.out.println(String.format("BM25 parameters k1=%f b=%f ", bm25_k1, bm25_b));

        boolean useFixedBM25 = cmd.hasOption("bm25fixed");

        EnglishAnalyzer analyzer = new EnglishAnalyzer();
        Similarity similarity = null;

        if (useFixedBM25) {
            System.out.println(String.format("Using fixed BM25Simlarity, k1=%f b=%f", bm25_k1, bm25_b));
            similarity = new BM25SimilarityFix(bm25_k1, bm25_b);
        } else {
            System.out.println(String.format("Using Lucene BM25Similarity, k1=%f b=%f", bm25_k1, bm25_b));
            similarity = new BM25Similarity(bm25_k1, bm25_b);
        }

        int maxQueryQty = Integer.MAX_VALUE;

        if (cmd.hasOption("max_query_qty")) {
            try {
                maxQueryQty = Integer.parseInt(cmd.getOptionValue("max_query_qty"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'max_query_qty'", options);
            }
        }

        System.out.println(String.format("Executing at most %d queries", maxQueryQty));

        if (cmd.hasOption("r")) {
            String qrelFile = cmd.getOptionValue("r");
            System.out.println("Using the qrel file: '" + qrelFile
                    + "', queries not returning a relevant entry will be ignored.");
            qrels = new QrelReader(qrelFile);
        }

        System.out.println(String.format("Using indexing directory %s", indexDir));

        LuceneCandidateProvider candProvider = new LuceneCandidateProvider(indexDir, analyzer, similarity);
        TextCleaner textCleaner = new TextCleaner(stopWords);

        QuerySource inpQuerySource = SourceFactory.createQuerySource(sourceName, inputFileName);
        QueryEntry inpQuery = null;

        BufferedWriter trecOutFile = new BufferedWriter(new FileWriter(new File(trecOutFileName)));

        int questNum = 0, questQty = 0;

        long totalTimeMS = 0;

        while ((inpQuery = inpQuerySource.next()) != null) {
            if (questQty >= maxQueryQty)
                break;
            ++questNum;

            String queryID = inpQuery.mQueryId;

            if (randGen.nextDouble() <= fProb) {
                ++questQty;

                String tokQuery = spaceJoin.join(textCleaner.cleanUp(inpQuery.mQueryText));
                String query = TextCleaner.luceneSafeCleanUp(tokQuery).trim();

                ResEntry[] results = null;

                if (query.isEmpty()) {
                    results = new ResEntry[0];
                    System.out.println(String.format("WARNING, empty query id = '%s'", inpQuery.mQueryId));
                } else {

                    try {
                        long start = System.currentTimeMillis();

                        results = candProvider.getCandidates(questNum, query, numRet);

                        long end = System.currentTimeMillis();
                        long searchTimeMS = end - start;
                        totalTimeMS += searchTimeMS;

                        System.out.println(String.format(
                                "Obtained results for the query # %d (answered %d queries), queryID %s the search took %d ms, we asked for max %d entries got %d",
                                questNum, questQty, queryID, searchTimeMS, numRet, results.length));

                    } catch (ParseException e) {
                        e.printStackTrace();
                        System.err.println(
                                "Error parsing query: " + query + " orig question is :" + inpQuery.mQueryText);
                        System.exit(1);
                    }
                }

                boolean bSave = true;

                if (qrels != null) {
                    boolean bOk = false;
                    for (ResEntry r : results) {
                        String label = qrels.get(queryID, r.mDocId);
                        if (candProvider.isRelevLabel(label, 1)) {
                            bOk = true;
                            break;
                        }
                    }
                    if (!bOk)
                        bSave = false;
                }

                //            System.out.println(String.format("Ranking results the query # %d queryId='%s' save results? %b", 
                //                                              questNum, queryID, bSave));          
                if (bSave) {
                    saveTrecResults(queryID, results, trecOutFile, TREC_RUN, numRet);
                }
            }

            if (questNum % 1000 == 0)
                System.out.println(String.format("Proccessed %d questions", questNum));

        }

        System.out.println(String.format("Proccessed %d questions, the search took %f MS on average", questQty,
                (float) totalTimeMS / questQty));

        trecOutFile.close();

    } catch (ParseException e) {
        e.printStackTrace();
        Usage("Cannot parse arguments: " + e, options);
    } catch (Exception e) {
        System.err.println("Terminating due to an exception: " + e);
        System.exit(1);
    }
}

From source file:com.ibm.watson.developer_cloud.professor_languo.pipeline.primary_search.SingletonAnalyzer.java

License:Open Source License

public static Analyzer generateAnalyzer(String type) {
    Analyzer analyzer = null;/*from w  ww . j  a  v a 2 s.co m*/
    switch (type) {
    case PrimarySearchConstants.STANDARD_ANALYZER:
        analyzer = new StandardAnalyzer();
        break;
    case PrimarySearchConstants.ENGLISH_ANALYZER:
        analyzer = new EnglishAnalyzer();
        break;
    default:
        analyzer = null;
        throw new IllegalArgumentException(Messages.getString("RetrieveAndRank.QUERY_GENERATOR_NOT_FOUND")); //$NON-NLS-1$
    }
    INSTANCE = analyzer;
    return analyzer;
}

From source file:com.stratio.cassandra.lucene.schema.SchemaAnalyzerTest.java

License:Apache License

@Test(expected = IllegalArgumentException.class)
public void testGetAnalyzerNull() {
    Analyzer defaultAnalyzer = new EnglishAnalyzer();
    Map<String, Mapper> mappers = new HashMap<>();
    Map<String, Analyzer> analyzers = new HashMap<>();
    SchemaAnalyzer schemaAnalyzer = new SchemaAnalyzer(defaultAnalyzer, analyzers, mappers);
    schemaAnalyzer.getAnalyzer(null);//from  ww  w .  ja v  a 2 s.c  o  m
}

From source file:com.stratio.cassandra.lucene.schema.SchemaAnalyzerTest.java

License:Apache License

@Test(expected = IllegalArgumentException.class)
public void testGetAnalyzerBlank() {
    Analyzer defaultAnalyzer = new EnglishAnalyzer();
    Map<String, Mapper> mappers = new HashMap<>();
    Map<String, Analyzer> analyzers = new HashMap<>();
    SchemaAnalyzer schemaAnalyzer = new SchemaAnalyzer(defaultAnalyzer, analyzers, mappers);
    schemaAnalyzer.getAnalyzer(" \t");
}

From source file:com.stratio.cassandra.lucene.schema.SchemaAnalyzerTest.java

License:Apache License

@Test
public void testGetAnalyzerNotExistent() {
    Analyzer defaultAnalyzer = new EnglishAnalyzer();
    Map<String, Mapper> mappers = new HashMap<>();
    Map<String, Analyzer> analyzers = new HashMap<>();
    SchemaAnalyzer schemaAnalyzer = new SchemaAnalyzer(defaultAnalyzer, analyzers, mappers);
    schemaAnalyzer.getAnalyzer("failing");
}

From source file:com.stratio.cassandra.lucene.schema.SchemaAnalyzerTest.java

License:Apache License

@Test
public void testStaticGetAnalyzer() {
    Analyzer expectedAnalyzer = new EnglishAnalyzer();
    Map<String, Analyzer> analyzers = new HashMap<>();
    analyzers.put("analyzer", expectedAnalyzer);
    Analyzer actualAnalyzer = SchemaAnalyzer.getAnalyzer(analyzers, "analyzer");
    assertEquals("Static get analyzer is failing", expectedAnalyzer, actualAnalyzer);
}

From source file:com.stratio.cassandra.lucene.schema.SchemaAnalyzerTest.java

License:Apache License

@Test
public void testToString() {
    Analyzer defaultAnalyzer = new EnglishAnalyzer();
    Map<String, Mapper> mappers = new HashMap<>();
    mappers.put("mapper", stringMapper().build("mapper"));
    Map<String, Analyzer> analyzers = new HashMap<>();
    SchemaAnalyzer schemaAnalyzer = new SchemaAnalyzer(defaultAnalyzer, analyzers, mappers);
    assertNotNull("Expected not null schema", schemaAnalyzer.toString());
}

From source file:com.stratio.cassandra.lucene.schema.SchemaTest.java

License:Apache License

@Test
public void testGetDefaultAnalyzer() {
    Map<String, Mapper> mappers = new HashMap<>();
    Schema schema = new Schema(new EnglishAnalyzer(), mappers, null);
    Analyzer analyzer = schema.getDefaultAnalyzer();
    assertEquals(EnglishAnalyzer.class, analyzer.getClass());
    schema.close();/*from  ww w .j  a v  a2  s  .  c  om*/
}

From source file:com.stratio.cassandra.lucene.schema.SchemaTest.java

License:Apache License

@Test(expected = IllegalArgumentException.class)
public void testGetAnalyzerNotExistent() {
    Map<String, Mapper> mappers = new HashMap<>();
    Schema schema = new Schema(new EnglishAnalyzer(), mappers, null);
    schema.getAnalyzer("custom");
    schema.close();//from  w  w w  .  j a v  a 2 s. co m
}