List of usage examples for the org.apache.lucene.analysis.en.EnglishAnalyzer constructor
public EnglishAnalyzer(CharArraySet stopwords)
From source file:IrqaQuery.java
License:Apache License
/**
 * Opens the shared {@code writer} over the Lucene index at {@code indexPath},
 * analyzing text with an {@link EnglishAnalyzer} whose stop words are loaded
 * from {@code stopPath}.
 *
 * @param indexPath filesystem path of the index directory
 * @param stopPath  path to the stop-word list read by {@code mygetStopwords}
 * @param sim       "TFIDF" selects {@link ClassicSimilarity}; any other value
 *                  (including "BM25" and null) selects {@link BM25Similarity}
 * @throws IOException if the index directory cannot be opened
 */
public static void makeIndexWriter(String indexPath, String stopPath, String sim) throws IOException {
    System.out.println("[makeIndexWriter] started");
    System.out.println("[makeIndexWriter]" + stopPath);

    Directory dir = FSDirectory.open(Paths.get(indexPath));
    Analyzer analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(mygetStopwords(stopPath)));
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

    // The original had separate, identical BM25 branches for "BM25" and the
    // default case; collapsed into one. Constant-first equals is null-safe.
    if ("TFIDF".equals(sim)) {
        iwc.setSimilarity(new ClassicSimilarity());
    } else {
        iwc.setSimilarity(new BM25Similarity());
    }

    writer = new IndexWriter(dir, iwc);
}
From source file:IrqaQuery.java
License:Apache License
public static List<Document> query(String index, String stoppath, String question, int numResult, String sim) throws Exception { IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index))); IndexSearcher searcher = new IndexSearcher(reader); Analyzer analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(mygetStopwords(stoppath))); if (sim.equals("TFIDF")) searcher.setSimilarity(new ClassicSimilarity()); else if (sim.equals("BM25")) searcher.setSimilarity(new BM25Similarity()); else/*from w w w . j av a 2s . co m*/ searcher.setSimilarity(new BM25Similarity()); String field = "contents"; QueryParser parser = new QueryParser(field, analyzer); Query query = parser.parse(parser.escape(question)); TopDocs results = searcher.search(query, numResult); ScoreDoc[] hits = results.scoreDocs; List<Document> docs = new ArrayList<Document>(); int numTotalHits = results.totalHits; // System.out.println(numTotalHits + " total matching documents"); int end = Math.min(numTotalHits, numResult); String searchResult = ""; // System.out.println("Only results 1 - " + hits.length); for (int i = 0; i < end; i++) { Document doc = searcher.doc(hits[i].doc); docs.add(doc); } return docs; }
From source file:luceneInterface.java
License:Apache License
public static List<Document> query(String index, String stoppath, String question, int numResult, String sim) throws Exception { IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index))); IndexSearcher searcher = new IndexSearcher(reader); Analyzer analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(mygetStopwords(stoppath))); if (sim.equals("TFIDF")) searcher.setSimilarity(new ClassicSimilarity()); else if (sim.equals("BM25")) searcher.setSimilarity(new BM25Similarity()); else/*from w ww . j a va 2s . com*/ searcher.setSimilarity(new BM25Similarity()); String field = "contents"; QueryParser parser = new QueryParser(field, analyzer); Query query = parser.parse(parser.escape(question)); BooleanQuery.Builder bqb = new BooleanQuery.Builder(); bqb.add(new TermQuery(new Term("contents", parser.escape(question))), BooleanClause.Occur.SHOULD); bqb.add(new TermQuery(new Term("sec", parser.escape(question))), BooleanClause.Occur.SHOULD); // Term term = new Term(field, question); // Query query = new TermQuery(term); // TopDocs results = searcher.search(query, numResult); TopDocs results = searcher.search(parser.parse(bqb.build().toString()), numResult); ScoreDoc[] hits = results.scoreDocs; List<Document> docs = new ArrayList<Document>(); int numTotalHits = results.totalHits; // System.out.println(numTotalHits + " total matching documents"); int end = Math.min(numTotalHits, numResult); String searchResult = ""; // System.out.println("Only results 1 - " + hits.length); for (int i = 0; i < end; i++) { Document doc = searcher.doc(hits[i].doc); docs.add(doc); } return docs; }
From source file:ai.castor.idf.FetchTermIDF.java
License:Apache License
/**
 * Computes the classic (TF-IDF) inverse document frequency of a single term
 * against this object's {@code reader}, using an English analyzer with no
 * stop words so the term itself is never discarded.
 *
 * Failures (e.g. the escaped term parses to something other than a
 * TermQuery) are reported to stderr and yield 0.0 — best-effort by design.
 *
 * @param term raw term text; escaped before parsing
 * @return the IDF of the analyzed term, or 0.0 on any failure
 * @throws ParseException declared for interface compatibility
 */
public double getTermIDF(String term) throws ParseException {
    Analyzer analyzer = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser queryParser = new QueryParser(FIELD_BODY, analyzer);
    ClassicSimilarity similarity = new ClassicSimilarity();

    String escaped = queryParser.escape(term);
    double idf = 0.0;
    try {
        TermQuery parsed = (TermQuery) queryParser.parse(escaped);
        Term luceneTerm = parsed.getTerm();
        idf = similarity.idf(reader.docFreq(luceneTerm), reader.numDocs());
        System.out.println(term + '\t' + escaped + '\t' + parsed + '\t' + luceneTerm + '\t' + idf);
    } catch (Exception e) {
        System.err.println("Exception in fetching IDF(" + term + "): " + e.toString());
    }
    return idf;
}
From source file:ai.castor.idf.IDFScorer.java
License:Apache License
/**
 * Sums the IDF of each answer term that also occurs in the analyzed query,
 * counting each distinct term exactly once.
 *
 * Terms are compared in {@code Term.toString()} form ("field:text"), which
 * matches the tokens produced by splitting {@code Query.toString()} — both
 * sides carry the same field prefix, so the comparison is consistent.
 *
 * @param query   question text; escaped and analyzed
 * @param answer  candidate answer, split on whitespace
 * @param analyze true: English analysis with {@code stopWords} removed;
 *                false: plain whitespace tokenization
 * @return accumulated IDF of the overlapping terms (0.0 if none)
 * @throws ParseException if the query itself cannot be parsed
 */
public double calcIDF(String query, String answer, boolean analyze) throws ParseException {
    Analyzer analyzer = analyze
            ? new EnglishAnalyzer(StopFilter.makeStopSet(stopWords))
            : new WhitespaceAnalyzer();

    QueryParser qp = new QueryParser(FIELD_BODY, analyzer);
    ClassicSimilarity similarity = new ClassicSimilarity();

    Query question = qp.parse(qp.escape(query));
    HashSet<String> questionTerms = new HashSet<>(Arrays.asList(question.toString().trim().split("\\s+")));

    double idf = 0.0;
    HashSet<String> seenTerms = new HashSet<>();
    for (String term : answer.split("\\s+")) {
        try {
            TermQuery q = (TermQuery) qp.parse(term);
            Term t = q.getTerm();
            // Set.add returns false for repeats, so each overlapping term is
            // counted once. (Replaced the original contains()+add() pair and
            // removed its no-op "idf += 0.0" else-branch.)
            if (questionTerms.contains(t.toString()) && seenTerms.add(t.toString())) {
                idf += similarity.idf(reader.docFreq(t), reader.numDocs());
            }
        } catch (Exception e) {
            // Tokens that don't parse to a single TermQuery are skipped;
            // this per-term tolerance is deliberate best-effort behavior.
            continue;
        }
    }
    return idf;
}
From source file:ca.mcgill.cs.creco.logic.search.CategorySearch.java
License:Apache License
/**
 * Constructor. Builds an in-memory (RAM-backed) search index over the
 * categories of the supplied data store, analyzed with an EnglishAnalyzer.
 *
 * @param pDataStore The database whose categories will be in the search index.
 * @throws IOException If an exception is thrown during the creation of the product index.
 */
@Autowired
public CategorySearch(IDataStore pDataStore) throws IOException {
    aDataStore = pDataStore;
    aAnalyzer = new EnglishAnalyzer(VERSION);
    aDirectory = new RAMDirectory();
    // Index construction must run after all collaborators are assigned.
    buildCategoryIndex();
}
From source file:ch.admin.isb.hermes5.business.search.AnalyserRepository.java
License:Apache License
public Analyzer getAnalyzer(String lang) { if (lang.equals("fr")) { return new FrenchAnalyzer(Version.LUCENE_47); }//www .j a va2 s.c om if (lang.equals("it")) { return new ItalianAnalyzer(Version.LUCENE_47); } if (lang.equals("en")) { return new EnglishAnalyzer(Version.LUCENE_47); } return new GermanAnalyzer(Version.LUCENE_47); }
From source file:com.tamingtext.classifier.mlt.MoreLikeThisQueryTest.java
License:Apache License
/**
 * Exercises Lucene's MoreLikeThis against an existing index: builds an MLT
 * query from the document at {@code inputPath}, searches, accumulates hit
 * scores per category, and prints categories ranked by total score.
 *
 * The XML-ish comments ({@code <start>}, {@code <co>}, {@code <calloutlist>})
 * are book-build callout markers and must be preserved verbatim.
 * NOTE(review): IndexReader.open and Fieldable are Lucene 3.x APIs.
 */
@Test
public void testMoreLikeThisQuery() throws Exception {
    //<start id="lucene.examples.mlt.setup"/>
    Directory directory = FSDirectory.open(new File(modelPath));
    IndexReader indexReader = IndexReader.open(directory); //<co id="mlt.indexsetup"/>
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);

    Analyzer analyzer //<co id="mlt.analyzersetup"/>
            = new EnglishAnalyzer(Version.LUCENE_36);

    if (nGramSize > 1) { //<co id="mlt.ngramsetup"/>
        analyzer = new ShingleAnalyzerWrapper(analyzer, nGramSize, nGramSize);
    }

    MoreLikeThis moreLikeThis = new MoreLikeThis(indexReader); //<co id="mlt.configure"/>
    moreLikeThis.setAnalyzer(analyzer);
    moreLikeThis.setFieldNames(new String[] { "content" });

    /*<calloutlist>
    <callout arearefs="mlt.indexsetup">Open Index</callout>
    <callout arearefs="mlt.analyzersetup">Setup Analyzer</callout>
    <callout arearefs="mlt.ngramsetup">Setup NGrams</callout>
    <callout arearefs="mlt.configure">Create <classname>MoreLikeThis</classname></callout>
    </calloutlist>*/
    //<end id="lucene.examples.mlt.setup"/>

    // for testing against the same corpus
    moreLikeThis.setMinTermFreq(1);
    moreLikeThis.setMinDocFreq(1);

    //<start id="lucene.examples.mlt.query"/>
    Reader reader = new FileReader(inputPath); //<co id="mlt.query"/>
    Query query = moreLikeThis.like(reader);

    TopDocs results = indexSearcher.search(query, maxResults); //<co id="mlt.search"/>

    // Accumulate one CategoryHits per category across all hits.
    HashMap<String, CategoryHits> categoryHash = new HashMap<String, CategoryHits>();
    for (ScoreDoc sd : results.scoreDocs) { //<co id="mlt.collect"/>
        Document d = indexReader.document(sd.doc);
        Fieldable f = d.getFieldable(categoryFieldName);
        String cat = f.stringValue();
        CategoryHits ch = categoryHash.get(cat);
        if (ch == null) {
            ch = new CategoryHits();
            ch.setLabel(cat);
            categoryHash.put(cat, ch);
        }
        ch.incrementScore(sd.score);
    }

    // Rank categories by their accumulated score.
    SortedSet<CategoryHits> sortedCats //<co id="mlt.rank"/>
            = new TreeSet<CategoryHits>(CategoryHits.byScoreComparator());
    sortedCats.addAll(categoryHash.values());
    for (CategoryHits c : sortedCats) { //<co id="mlt.display"/>
        System.out.println(c.getLabel() + "\t" + c.getScore());
    }
    /*<calloutlist>
    <callout arearefs="mlt.query">Create Query</callout>
    <callout arearefs="mlt.search">Perform Search</callout>
    <callout arearefs="mlt.collect">Collect Results</callout>
    <callout arearefs="mlt.rank">Rank Categories</callout>
    <callout arearefs="mlt.display">Display Categories</callout>
    </calloutlist>*/
    //<end id="lucene.examples.mlt.query"/>
}
From source file:com.tamingtext.classifier.mlt.TestMoreLikeThis.java
License:Apache License
/**
 * Command-line entry point: classifies each tab-separated (category, text)
 * line of the input files with a MoreLikeThis categorizer built over an
 * existing index, then prints a ResultAnalyzer accuracy summary.
 *
 * Required options: --input, --model, --categoryField, --contentField.
 * Optional: --maxResults (default 10), --gramSize (default 1),
 * --classifierType (knn|tfidf; any other value throws).
 *
 * The scraped source broke the gramSize description string across a line;
 * it is rejoined here. The callout comments are book-build markers and are
 * preserved verbatim.
 */
public static void main(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option helpOpt = DefaultOptionCreator.helpOption();
    Option inputDirOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("The input directory").withShortName("i").create();
    Option modelOpt = obuilder.withLongName("model").withRequired(true)
            .withArgument(abuilder.withName("index").withMinimum(1).withMaximum(1).create())
            .withDescription("The directory containing the index model").withShortName("m").create();
    Option categoryFieldOpt = obuilder.withLongName("categoryField").withRequired(true)
            .withArgument(abuilder.withName("index").withMinimum(1).withMaximum(1).create())
            .withDescription("Name of the field containing category information").withShortName("catf")
            .create();
    Option contentFieldOpt = obuilder.withLongName("contentField").withRequired(true)
            .withArgument(abuilder.withName("index").withMinimum(1).withMaximum(1).create())
            .withDescription("Name of the field containing content information").withShortName("contf")
            .create();
    Option maxResultsOpt = obuilder.withLongName("maxResults").withRequired(false)
            .withArgument(abuilder.withName("gramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("Number of results to retrive, default: 10 ").withShortName("r").create();
    Option gramSizeOpt = obuilder.withLongName("gramSize").withRequired(false)
            .withArgument(abuilder.withName("gramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("Size of the n-gram. Default Value: 1 ").withShortName("ng").create();
    Option typeOpt = obuilder.withLongName("classifierType").withRequired(false)
            .withArgument(abuilder.withName("classifierType").withMinimum(1).withMaximum(1).create())
            .withDescription("Type of classifier: knn|tfidf. Default: bayes").withShortName("type").create();

    Group group = gbuilder.withName("Options").withOption(gramSizeOpt).withOption(helpOpt)
            .withOption(inputDirOpt).withOption(modelOpt).withOption(typeOpt).withOption(contentFieldOpt)
            .withOption(categoryFieldOpt).withOption(maxResultsOpt).create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        CommandLine cmdLine = parser.parse(args);
        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        String classifierType = (String) cmdLine.getValue(typeOpt);

        // Option values arrive as Strings; numeric ones are parsed with
        // defaults of gramSize=1 and maxResults=10.
        int gramSize = 1;
        if (cmdLine.hasOption(gramSizeOpt)) {
            gramSize = Integer.parseInt((String) cmdLine.getValue(gramSizeOpt));
        }
        int maxResults = 10;
        if (cmdLine.hasOption(maxResultsOpt)) {
            maxResults = Integer.parseInt((String) cmdLine.getValue(maxResultsOpt));
        }

        String inputPath = (String) cmdLine.getValue(inputDirOpt);
        String modelPath = (String) cmdLine.getValue(modelOpt);
        String categoryField = (String) cmdLine.getValue(categoryFieldOpt);
        String contentField = (String) cmdLine.getValue(contentFieldOpt);

        MatchMode mode;
        if ("knn".equalsIgnoreCase(classifierType)) {
            mode = MatchMode.KNN;
        } else if ("tfidf".equalsIgnoreCase(classifierType)) {
            mode = MatchMode.TFIDF;
        } else {
            throw new IllegalArgumentException("Unkown classifierType: " + classifierType);
        }

        Directory directory = FSDirectory.open(new File(modelPath));
        IndexReader indexReader = IndexReader.open(directory);
        Analyzer analyzer //<co id="mlt.analyzersetup"/>
                = new EnglishAnalyzer(Version.LUCENE_36);

        MoreLikeThisCategorizer categorizer = new MoreLikeThisCategorizer(indexReader, categoryField);
        categorizer.setAnalyzer(analyzer);
        categorizer.setMatchMode(mode);
        categorizer.setFieldNames(new String[] { contentField });
        categorizer.setMaxResults(maxResults);
        categorizer.setNgramSize(gramSize);

        File f = new File(inputPath);
        if (!f.isDirectory()) {
            throw new IllegalArgumentException(f + " is not a directory or does not exit");
        }
        File[] inputFiles = FileUtil.buildFileList(f);

        String line = null;
        //<start id="lucene.examples.mlt.test"/>
        // Any document whose categorization returns no hits is scored as
        // "unknown".
        final ClassifierResult UNKNOWN = new ClassifierResult("unknown", 1.0);
        ResultAnalyzer resultAnalyzer = //<co id="co.mlt.ra"/>
                new ResultAnalyzer(categorizer.getCategories(), UNKNOWN.getLabel());

        for (File ff : inputFiles) { //<co id="co.mlt.read"/>
            BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(ff), "UTF-8"));
            while ((line = in.readLine()) != null) {
                // Each test line is "<category>\t<text>"; malformed lines are skipped.
                String[] parts = line.split("\t");
                if (parts.length != 2) {
                    continue;
                }

                CategoryHits[] hits //<co id="co.mlt.cat"/>
                        = categorizer.categorize(new StringReader(parts[1]));
                ClassifierResult result = hits.length > 0 ? hits[0] : UNKNOWN;
                resultAnalyzer.addInstance(parts[0], result); //<co id="co.mlt.an"/>
            }
            in.close();
        }

        System.out.println(resultAnalyzer.toString()); //<co id="co.mlt.print"/>
        /* <calloutlist>
        <callout arearefs="co.mlt.ra">Create <classname>ResultAnalyzer</classname></callout>
        <callout arearefs="co.mlt.read">Read Test data</callout>
        <callout arearefs="co.mlt.cat">Categorize</callout>
        <callout arearefs="co.mlt.an">Collect Results</callout>
        <callout arearefs="co.mlt.print">Display Results</callout>
        </calloutlist> */
        //<end id="lucene.examples.mlt.test"/>
    } catch (OptionException e) {
        log.error("Error while parsing options", e);
    }
}
From source file:com.tamingtext.classifier.mlt.TrainMoreLikeThis.java
License:Apache License
/**
 * Opens {@code this.writer} over the index directory at {@code pathname},
 * using an English analyzer (wrapped with shingles when nGramSize > 1) and
 * OpenMode.CREATE, which discards any existing index at that location.
 *
 * The callout comments are book-build markers and are preserved verbatim.
 *
 * @param pathname filesystem location of the index to (re)create
 * @throws IOException if the directory or writer cannot be opened
 */
protected void openIndexWriter(String pathname) throws IOException {
    //<start id="lucene.examples.index.setup"/>
    Directory directory //<co id="luc.index.dir"/>
            = FSDirectory.open(new File(pathname));

    Analyzer analyzer //<co id="luc.index.analyzer"/>
            = new EnglishAnalyzer(Version.LUCENE_36);

    if (nGramSize > 1) { //<co id="luc.index.shingle"/>
        ShingleAnalyzerWrapper sw = new ShingleAnalyzerWrapper(analyzer,
                nGramSize, // min shingle size
                nGramSize, // max shingle size
                "-", // token separator
                true, // output unigrams
                true); // output unigrams if no shingles
        analyzer = sw;
    }

    IndexWriterConfig config //<co id="luc.index.create"/>
            = new IndexWriterConfig(Version.LUCENE_36, analyzer);
    config.setOpenMode(OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(directory, config);
    /* <calloutlist>
    <callout arearefs="luc.index.dir">Create Index Directory</callout>
    <callout arearefs="luc.index.analyzer">Setup Analyzer</callout>
    <callout arearefs="luc.index.shingle">Setup Shingle Filter</callout>
    <callout arearefs="luc.index.create">Create <classname>IndexWriter</classname></callout>
    </calloutlist> */
    //<end id="lucene.examples.index.setup"/>
    this.writer = writer;
}