List of usage examples for org.apache.lucene.analysis.en EnglishAnalyzer EnglishAnalyzer
public EnglishAnalyzer()
From source file:apps.LuceneIndexer.java
License:Apache License
public static void main(String[] args) { Options options = new Options(); options.addOption("i", null, true, "input file"); options.addOption("o", null, true, "output directory"); options.addOption("r", null, true, "optional output TREC-format QREL file"); options.addOption("bm25_b", null, true, "BM25 parameter: b"); options.addOption("bm25_k1", null, true, "BM25 parameter: k1"); options.addOption("bm25fixed", null, false, "use the fixed BM25 similarity"); Joiner commaJoin = Joiner.on(','); Joiner spaceJoin = Joiner.on(' '); options.addOption("source_type", null, true, "document source type: " + commaJoin.join(SourceFactory.getDocSourceList())); // If you increase this value, you may need to modify the following line in *.sh file // export MAVEN_OPTS="-Xms8192m -server" double ramBufferSizeMB = 1024 * 8; // 8 GB CommandLineParser parser = new org.apache.commons.cli.GnuParser(); IndexWriter indexWriter = null;//from w w w . j a va2 s. com BufferedWriter qrelWriter = null; int docNum = 0; try { CommandLine cmd = parser.parse(options, args); String inputFileName = null, outputDirName = null, qrelFileName = null; if (cmd.hasOption("i")) { inputFileName = cmd.getOptionValue("i"); } else { Usage("Specify 'input file'", options); } if (cmd.hasOption("o")) { outputDirName = cmd.getOptionValue("o"); } else { Usage("Specify 'index directory'", options); } if (cmd.hasOption("r")) { qrelFileName = cmd.getOptionValue("r"); } String sourceName = cmd.getOptionValue("source_type"); if (sourceName == null) Usage("Specify document source type", options); if (qrelFileName != null) qrelWriter = new BufferedWriter(new FileWriter(qrelFileName)); File outputDir = new File(outputDirName); if (!outputDir.exists()) { if (!outputDir.mkdirs()) { System.out.println("couldn't create " + outputDir.getAbsolutePath()); System.exit(1); } } if (!outputDir.isDirectory()) { System.out.println(outputDir.getAbsolutePath() + " is not a directory!"); System.exit(1); } if (!outputDir.canWrite()) { System.out.println("Can't write to " + outputDir.getAbsolutePath()); System.exit(1); } boolean useFixedBM25 = cmd.hasOption("bm25fixed"); float bm25_k1 = UtilConst.BM25_K1_DEFAULT, bm25_b = UtilConst.BM25_B_DEFAULT; if (cmd.hasOption("bm25_k1")) { try { bm25_k1 = Float.parseFloat(cmd.getOptionValue("bm25_k1")); } catch (NumberFormatException e) { Usage("Wrong format for 'bm25_k1'", options); } } if (cmd.hasOption("bm25_b")) { try { bm25_b = Float.parseFloat(cmd.getOptionValue("bm25_b")); } catch (NumberFormatException e) { Usage("Wrong format for 'bm25_b'", options); } } EnglishAnalyzer analyzer = new EnglishAnalyzer(); FSDirectory indexDir = FSDirectory.open(Paths.get(outputDirName)); IndexWriterConfig indexConf = new IndexWriterConfig(analyzer); /* OpenMode.CREATE creates a new index or overwrites an existing one. https://lucene.apache.org/core/6_0_0/core/org/apache/lucene/index/IndexWriterConfig.OpenMode.html#CREATE */ indexConf.setOpenMode(OpenMode.CREATE); indexConf.setRAMBufferSizeMB(ramBufferSizeMB); System.out.println(String.format("BM25 parameters k1=%f b=%f ", bm25_k1, bm25_b)); if (useFixedBM25) { System.out.println(String.format("Using fixed BM25Simlarity, k1=%f b=%f", bm25_k1, bm25_b)); indexConf.setSimilarity(new BM25SimilarityFix(bm25_k1, bm25_b)); } else { System.out.println(String.format("Using Lucene BM25Similarity, k1=%f b=%f", bm25_k1, bm25_b)); indexConf.setSimilarity(new BM25Similarity(bm25_k1, bm25_b)); } indexWriter = new IndexWriter(indexDir, indexConf); DocumentSource inpDocSource = SourceFactory.createDocumentSource(sourceName, inputFileName); DocumentEntry inpDoc = null; TextCleaner textCleaner = new TextCleaner(null); while ((inpDoc = inpDocSource.next()) != null) { ++docNum; Document luceneDoc = new Document(); ArrayList<String> cleanedToks = textCleaner.cleanUp(inpDoc.mDocText); String cleanText = spaceJoin.join(cleanedToks); // System.out.println(inpDoc.mDocId); // System.out.println(cleanText); // System.out.println("=============================="); luceneDoc.add(new StringField(UtilConst.FIELD_ID, inpDoc.mDocId, Field.Store.YES)); luceneDoc.add(new TextField(UtilConst.FIELD_TEXT, cleanText, Field.Store.YES)); indexWriter.addDocument(luceneDoc); if (inpDoc.mIsRel != null && qrelWriter != null) { saveQrelOneEntry(qrelWriter, inpDoc.mQueryId, inpDoc.mDocId, inpDoc.mIsRel ? MAX_GRADE : 0); } if (docNum % 1000 == 0) System.out.println(String.format("Indexed %d documents", docNum)); } } catch (ParseException e) { e.printStackTrace(); Usage("Cannot parse arguments" + e, options); } catch (Exception e) { System.err.println("Terminating due to an exception: " + e); System.exit(1); } finally { System.out.println(String.format("Indexed %d documents", docNum)); try { if (null != indexWriter) indexWriter.close(); if (null != qrelWriter) qrelWriter.close(); } catch (IOException e) { System.err.println("IO exception: " + e); e.printStackTrace(); } } }
From source file:apps.LuceneQuery.java
License:Apache License
public static void main(String[] args) { Options options = new Options(); options.addOption("d", null, true, "index directory"); options.addOption("i", null, true, "input file"); options.addOption("s", null, true, "stop word file"); options.addOption("n", null, true, "max # of results"); options.addOption("o", null, true, "a TREC-style output file"); options.addOption("r", null, true, "an optional QREL file, if specified," + "we save results only for queries for which we find at least one relevant entry."); options.addOption("prob", null, true, "question sampling probability"); options.addOption("max_query_qty", null, true, "a maximum number of queries to run"); options.addOption("bm25_b", null, true, "BM25 parameter: b"); options.addOption("bm25_k1", null, true, "BM25 parameter: k1"); options.addOption("bm25fixed", null, false, "use the fixed BM25 similarity"); options.addOption("seed", null, true, "random seed"); Joiner commaJoin = Joiner.on(','); Joiner spaceJoin = Joiner.on(' '); options.addOption("source_type", null, true, "query source type: " + commaJoin.join(SourceFactory.getQuerySourceList())); CommandLineParser parser = new org.apache.commons.cli.GnuParser(); QrelReader qrels = null;/*from w w w. j av a 2 s. c o m*/ try { CommandLine cmd = parser.parse(options, args); String indexDir = null; if (cmd.hasOption("d")) { indexDir = cmd.getOptionValue("d"); } else { Usage("Specify 'index directory'", options); } String inputFileName = null; if (cmd.hasOption("i")) { inputFileName = cmd.getOptionValue("i"); } else { Usage("Specify 'input file'", options); } DictNoComments stopWords = null; if (cmd.hasOption("s")) { String stopWordFileName = cmd.getOptionValue("s"); stopWords = new DictNoComments(new File(stopWordFileName), true /* lowercasing */); System.out.println("Using the stopword file: " + stopWordFileName); } String sourceName = cmd.getOptionValue("source_type"); if (sourceName == null) Usage("Specify document source type", options); int numRet = 100; if (cmd.hasOption("n")) { numRet = Integer.parseInt(cmd.getOptionValue("n")); System.out.println("Retrieving at most " + numRet + " candidate entries."); } String trecOutFileName = null; if (cmd.hasOption("o")) { trecOutFileName = cmd.getOptionValue("o"); } else { Usage("Specify 'a TREC-style output file'", options); } double fProb = 1.0f; if (cmd.hasOption("prob")) { try { fProb = Double.parseDouble(cmd.getOptionValue("prob")); } catch (NumberFormatException e) { Usage("Wrong format for 'question sampling probability'", options); } } if (fProb <= 0 || fProb > 1) { Usage("Question sampling probability should be >0 and <=1", options); } System.out.println("Sample the following fraction of questions: " + fProb); float bm25_k1 = UtilConst.BM25_K1_DEFAULT, bm25_b = UtilConst.BM25_B_DEFAULT; if (cmd.hasOption("bm25_k1")) { try { bm25_k1 = Float.parseFloat(cmd.getOptionValue("bm25_k1")); } catch (NumberFormatException e) { Usage("Wrong format for 'bm25_k1'", options); } } if (cmd.hasOption("bm25_b")) { try { bm25_b = Float.parseFloat(cmd.getOptionValue("bm25_b")); } catch (NumberFormatException e) { Usage("Wrong format for 'bm25_b'", options); } } long seed = 0; String tmpl = cmd.getOptionValue("seed"); if (tmpl != null) seed = Long.parseLong(tmpl); System.out.println("Using seed: " + seed); Random randGen = new Random(seed); System.out.println(String.format("BM25 parameters k1=%f b=%f ", bm25_k1, bm25_b)); boolean useFixedBM25 = cmd.hasOption("bm25fixed"); EnglishAnalyzer analyzer = new EnglishAnalyzer(); Similarity similarity = null; if (useFixedBM25) { System.out.println(String.format("Using fixed BM25Simlarity, k1=%f b=%f", bm25_k1, bm25_b)); similarity = new BM25SimilarityFix(bm25_k1, bm25_b); } else { System.out.println(String.format("Using Lucene BM25Similarity, k1=%f b=%f", bm25_k1, bm25_b)); similarity = new BM25Similarity(bm25_k1, bm25_b); } int maxQueryQty = Integer.MAX_VALUE; if (cmd.hasOption("max_query_qty")) { try { maxQueryQty = Integer.parseInt(cmd.getOptionValue("max_query_qty")); } catch (NumberFormatException e) { Usage("Wrong format for 'max_query_qty'", options); } } System.out.println(String.format("Executing at most %d queries", maxQueryQty)); if (cmd.hasOption("r")) { String qrelFile = cmd.getOptionValue("r"); System.out.println("Using the qrel file: '" + qrelFile + "', queries not returning a relevant entry will be ignored."); qrels = new QrelReader(qrelFile); } System.out.println(String.format("Using indexing directory %s", indexDir)); LuceneCandidateProvider candProvider = new LuceneCandidateProvider(indexDir, analyzer, similarity); TextCleaner textCleaner = new TextCleaner(stopWords); QuerySource inpQuerySource = SourceFactory.createQuerySource(sourceName, inputFileName); QueryEntry inpQuery = null; BufferedWriter trecOutFile = new BufferedWriter(new FileWriter(new File(trecOutFileName))); int questNum = 0, questQty = 0; long totalTimeMS = 0; while ((inpQuery = inpQuerySource.next()) != null) { if (questQty >= maxQueryQty) break; ++questNum; String queryID = inpQuery.mQueryId; if (randGen.nextDouble() <= fProb) { ++questQty; String tokQuery = spaceJoin.join(textCleaner.cleanUp(inpQuery.mQueryText)); String query = TextCleaner.luceneSafeCleanUp(tokQuery).trim(); ResEntry[] results = null; if (query.isEmpty()) { results = new ResEntry[0]; System.out.println(String.format("WARNING, empty query id = '%s'", inpQuery.mQueryId)); } else { try { long start = System.currentTimeMillis(); results = candProvider.getCandidates(questNum, query, numRet); long end = System.currentTimeMillis(); long searchTimeMS = end - start; totalTimeMS += searchTimeMS; System.out.println(String.format( "Obtained results for the query # %d (answered %d queries), queryID %s the search took %d ms, we asked for max %d entries got %d", questNum, questQty, queryID, searchTimeMS, numRet, results.length)); } catch (ParseException e) { e.printStackTrace(); System.err.println( "Error parsing query: " + query + " orig question is :" + inpQuery.mQueryText); System.exit(1); } } boolean bSave = true; if (qrels != null) { boolean bOk = false; for (ResEntry r : results) { String label = qrels.get(queryID, r.mDocId); if (candProvider.isRelevLabel(label, 1)) { bOk = true; break; } } if (!bOk) bSave = false; } // System.out.println(String.format("Ranking results the query # %d queryId='%s' save results? %b", // questNum, queryID, bSave)); if (bSave) { saveTrecResults(queryID, results, trecOutFile, TREC_RUN, numRet); } } if (questNum % 1000 == 0) System.out.println(String.format("Proccessed %d questions", questNum)); } System.out.println(String.format("Proccessed %d questions, the search took %f MS on average", questQty, (float) totalTimeMS / questQty)); trecOutFile.close(); } catch (ParseException e) { e.printStackTrace(); Usage("Cannot parse arguments: " + e, options); } catch (Exception e) { System.err.println("Terminating due to an exception: " + e); System.exit(1); } }
From source file:com.ibm.watson.developer_cloud.professor_languo.pipeline.primary_search.SingletonAnalyzer.java
License:Open Source License
public static Analyzer generateAnalyzer(String type) { Analyzer analyzer = null;/*from w ww . j a v a 2 s.co m*/ switch (type) { case PrimarySearchConstants.STANDARD_ANALYZER: analyzer = new StandardAnalyzer(); break; case PrimarySearchConstants.ENGLISH_ANALYZER: analyzer = new EnglishAnalyzer(); break; default: analyzer = null; throw new IllegalArgumentException(Messages.getString("RetrieveAndRank.QUERY_GENERATOR_NOT_FOUND")); //$NON-NLS-1$ } INSTANCE = analyzer; return analyzer; }
From source file:com.stratio.cassandra.lucene.schema.SchemaAnalyzerTest.java
License:Apache License
@Test(expected = IllegalArgumentException.class) public void testGetAnalyzerNull() { Analyzer defaultAnalyzer = new EnglishAnalyzer(); Map<String, Mapper> mappers = new HashMap<>(); Map<String, Analyzer> analyzers = new HashMap<>(); SchemaAnalyzer schemaAnalyzer = new SchemaAnalyzer(defaultAnalyzer, analyzers, mappers); schemaAnalyzer.getAnalyzer(null);//from ww w . ja v a 2 s.c o m }
From source file:com.stratio.cassandra.lucene.schema.SchemaAnalyzerTest.java
License:Apache License
@Test(expected = IllegalArgumentException.class) public void testGetAnalyzerBlank() { Analyzer defaultAnalyzer = new EnglishAnalyzer(); Map<String, Mapper> mappers = new HashMap<>(); Map<String, Analyzer> analyzers = new HashMap<>(); SchemaAnalyzer schemaAnalyzer = new SchemaAnalyzer(defaultAnalyzer, analyzers, mappers); schemaAnalyzer.getAnalyzer(" \t"); }
From source file:com.stratio.cassandra.lucene.schema.SchemaAnalyzerTest.java
License:Apache License
@Test public void testGetAnalyzerNotExistent() { Analyzer defaultAnalyzer = new EnglishAnalyzer(); Map<String, Mapper> mappers = new HashMap<>(); Map<String, Analyzer> analyzers = new HashMap<>(); SchemaAnalyzer schemaAnalyzer = new SchemaAnalyzer(defaultAnalyzer, analyzers, mappers); schemaAnalyzer.getAnalyzer("failing"); }
From source file:com.stratio.cassandra.lucene.schema.SchemaAnalyzerTest.java
License:Apache License
@Test public void testStaticGetAnalyzer() { Analyzer expectedAnalyzer = new EnglishAnalyzer(); Map<String, Analyzer> analyzers = new HashMap<>(); analyzers.put("analyzer", expectedAnalyzer); Analyzer actualAnalyzer = SchemaAnalyzer.getAnalyzer(analyzers, "analyzer"); assertEquals("Static get analyzer is failing", expectedAnalyzer, actualAnalyzer); }
From source file:com.stratio.cassandra.lucene.schema.SchemaAnalyzerTest.java
License:Apache License
@Test public void testToString() { Analyzer defaultAnalyzer = new EnglishAnalyzer(); Map<String, Mapper> mappers = new HashMap<>(); mappers.put("mapper", stringMapper().build("mapper")); Map<String, Analyzer> analyzers = new HashMap<>(); SchemaAnalyzer schemaAnalyzer = new SchemaAnalyzer(defaultAnalyzer, analyzers, mappers); assertNotNull("Expected not null schema", schemaAnalyzer.toString()); }
From source file:com.stratio.cassandra.lucene.schema.SchemaTest.java
License:Apache License
@Test public void testGetDefaultAnalyzer() { Map<String, Mapper> mappers = new HashMap<>(); Schema schema = new Schema(new EnglishAnalyzer(), mappers, null); Analyzer analyzer = schema.getDefaultAnalyzer(); assertEquals(EnglishAnalyzer.class, analyzer.getClass()); schema.close();/*from ww w .j a v a2 s . c om*/ }
From source file:com.stratio.cassandra.lucene.schema.SchemaTest.java
License:Apache License
@Test(expected = IllegalArgumentException.class) public void testGetAnalyzerNotExistent() { Map<String, Mapper> mappers = new HashMap<>(); Schema schema = new Schema(new EnglishAnalyzer(), mappers, null); schema.getAnalyzer("custom"); schema.close();//from w w w . j a v a 2 s. co m }