List of usage examples for the org.apache.lucene.analysis.core.StopAnalyzer constructor (StopAnalyzer())
From source file:br.edu.utfpr.cm.JGitMinerWeb.util.LuceneUtil.java
/**
 * Tokenizes a line of text with Lucene's StopAnalyzer (lower-cases, splits on
 * non-letters and drops English stop words).
 *
 * @param linha the input text line to tokenize
 * @return the tokens produced; on an I/O error, whatever was collected so far
 */
public static List<String> tokenizeString(String linha) {
    List<String> result = new ArrayList<>();
    // try-with-resources closes both the analyzer and the stream,
    // fixing the resource leak in the original version.
    try (Analyzer analyzer = new StopAnalyzer();
            TokenStream stream = analyzer.tokenStream(null, new StringReader(linha))) {
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
    } catch (IOException e) {
        // Best-effort: keep the original contract (print and return the
        // partial result) instead of propagating the unlikely IOException.
        System.out.println(e.getMessage());
    }
    return result;
}
From source file:com.faqit.similarity.NGramExtractor.java
License:Open Source License
/** * Extracts NGrams from a String of text. Can handle ngrams of any length * and also perform stop word removal before extraction * //from w ww . j a va 2 s . co m * @param text * the text that the ngrams should be extracted from * @param length * the length of the ngrams * @param stopWords * whether or not stopwords should be removed before extraction * @param overlap * whether or not the ngrams should overlap */ public void extract(String text, int length, Boolean stopWords, Boolean overlap) throws FileNotFoundException, IOException { this.text = text; this.length = length; this.stopWords = stopWords; this.overlap = overlap; nGrams = new LinkedList<String>(); uniqueNGrams = new LinkedList<String>(); nGramFreqs = new HashMap<String, Integer>(); /* * If the minLength and maxLength are both 1, then we want unigrams Make * use of a StopAnalyzer when stopwords should be removed Make use of a * SimpleAnalyzer when stop words should be included */ if (length == 1) { if (this.stopWords) { analyzer = new StandardAnalyzer(); } else { analyzer = new SimpleAnalyzer(); } } else { // Bigger than unigrams so use ShingleAnalyzerWrapper. Once // again, different analyzers depending on stop word removal if (this.stopWords) { analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(), length, length, " ", false, false, ""); // This is a // hack to use // Lucene 2.4 // since in 2.4 // position // increments // weren't // preserved by // default. // Using a later // version puts // underscores // (_) in the // place of // removed stop // words. 
} else { analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(), length, length, " ", false, false, ""); } } // Code to process and extract the ngrams TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text)); // OffsetAttribute offsetAttribute = // tokenStream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); // int tokenCount = 0; tokenStream.reset(); while (tokenStream.incrementToken()) { // int startOffset = offsetAttribute.startOffset(); // int endOffset = offsetAttribute.endOffset(); String termToken = charTermAttribute.toString(); // The actual token // term nGrams.add(termToken); // Add all ngrams to the ngram LinkedList // If n-grams are not allowed to overlap, then increment to point of // no overlap if (!overlap) { for (int i = 0; i < length - 1; i++) { tokenStream.incrementToken(); } } } // Store unique nGrams and frequencies in hash tables for (String nGram : nGrams) { if (nGramFreqs.containsKey(nGram)) { nGramFreqs.put(nGram, nGramFreqs.get(nGram) + 1); } else { nGramFreqs.put(nGram, 1); uniqueNGrams.add(nGram); } } }
From source file:com.hurence.logisland.processor.MatchQuery.java
License:Apache License
@Override public void init(final ProcessContext context) { keywordAnalyzer = new KeywordAnalyzer(); standardAnalyzer = new StandardAnalyzer(); stopAnalyzer = new StopAnalyzer(); matchingRules = new HashMap<>(); NumericQueryParser queryMatcher = new NumericQueryParser("field"); // loop over dynamic properties to add rules for (final Map.Entry<PropertyDescriptor, String> entry : context.getProperties().entrySet()) { if (!entry.getKey().isDynamic()) { continue; }/*from w w w. j a v a2 s .c om*/ final String name = entry.getKey().getName(); final String query = entry.getValue(); matchingRules.put(name, new MatchingRule(name, query)); } try { monitor = new Monitor(queryMatcher, new TermFilteredPresearcher()); // TODO infer numeric type here if (context.getPropertyValue(NUMERIC_FIELDS).isSet()) { final String[] numericFields = context.getPropertyValue(NUMERIC_FIELDS).asString().split(","); for (String numericField : numericFields) { queryMatcher.setNumericField(numericField); } } //monitor = new Monitor(new LuceneQueryParser("field"), new TermFilteredPresearcher()); for (MatchingRule rule : matchingRules.values()) { MonitorQuery mq = new MonitorQuery(rule.getName(), rule.getQuery()); monitor.update(mq); } } catch (IOException e) { e.printStackTrace(); } }
From source file:HW1.generateIndex.java
License:Apache License
/** Index all text files under a directory. */ public static void main(String[] args) { String filePath = "/Users/yangyang/Desktop/lucene/corpus"; File folder = new File(filePath); File[] files = folder.listFiles(); String[] fields = { "DOCNO", "HEAD", "BYLINE", "DATELINE", "TEXT" }; ArrayList<HashMap<String, String>> documents = new ArrayList<HashMap<String, String>>(); int num = 0;//w ww . j a v a 2 s. co m for (File file : files) { // read each file BufferedReader br = null; String line; try { br = new BufferedReader(new FileReader(file)); String xmlRecords = ""; while ((line = br.readLine()) != null) { // change "&" to "&" to avoid bug in parse XML if (line.contains("&")) { line = line.replaceAll("&", "&"); } if (line.startsWith("<DOC>")) { xmlRecords = line; } else if (line.startsWith("</DOC>")) { xmlRecords += line; // use ReadXMLFile.java to parse the XMLfile string num += 1; ReadXMLFile r = new ReadXMLFile(); HashMap<String, String> document = r.parse(xmlRecords, fields); // System.out.println(document.toString()); documents.add(document); } else { xmlRecords += line + " "; } } } catch (Exception e) { e.printStackTrace(); } } System.out.println(num); String[] indexPaths = { "/Users/yangyang/Desktop/lucene/index/index01", "/Users/yangyang/Desktop/lucene/index/index02", "/Users/yangyang/Desktop/lucene/index/index03", "/Users/yangyang/Desktop/lucene/index/index04", }; for (String indexPath : indexPaths) { try { System.out.println("Indexing to directory '" + indexPath + "'..."); Directory dir = FSDirectory.open(Paths.get(indexPath)); Analyzer analyzer = null; if (indexPath.endsWith("1")) { analyzer = new KeywordAnalyzer(); } else if (indexPath.endsWith("2")) { analyzer = new SimpleAnalyzer(); } else if (indexPath.endsWith("3")) { analyzer = new StopAnalyzer(); } else if (indexPath.endsWith("4")) { analyzer = new StandardAnalyzer(); } IndexWriterConfig iwc = new IndexWriterConfig(analyzer); iwc.setOpenMode(OpenMode.CREATE); IndexWriter writer = new 
IndexWriter(dir, iwc); for (HashMap<String, String> doc : documents) { indexDoc(writer, doc); } writer.close(); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } } }
From source file:index.ForwardIndex.java
public Map<String, List<String>> createForwardIndex() { try {// w ww. jav a 2 s.c o m List<String> segment = new ArrayList<String>(); String query = "select * from pages"; ResultSet rs = db.executeQuery(query); String url; String file; int offset = 0; System.out.println("Start creating forward index"); while (rs.next()) { url = rs.getString("url"); file = rs.getString("raw"); offset = Integer.parseInt(rs.getString("offset")); String content = analyser.getContent(file, offset); Analyzer analyzer = new StopAnalyzer(); segment = LuceneUtil.tokenizeString(analyzer, content); indexMap.put(url, segment); } rs.close(); } catch (SQLException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } System.out.println("Finished, the size of forward index is " + indexMap.size()); return indexMap; }
From source file:irlucene.Main.java
public static void main(String args[]) { metricsMeanCFC(new StopAnalyzer(), 0); //metricsMeanMED(new StopAnalyzer()); }
From source file:me.smoe.adar.analyzer.luence.AnalyzerToy.java
License:Apache License
/**
 * Tokenizes {@code sentence} with a StopAnalyzer and prints every token,
 * followed by " ,", to standard out.
 *
 * @param sentence the text to analyze
 * @throws Exception if tokenization fails
 */
public static void analyzerByStop(String sentence) throws Exception {
    // try-with-resources fixes the original leaks: the analyzer was only
    // closed on the success path and the token stream was never closed.
    try (Analyzer analyzer = new StopAnalyzer();
            TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence))) {
        // Capture the attribute once; incrementToken() updates it in place,
        // so there is no need to re-fetch it every iteration.
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            System.out.print(charTermAttribute.toString() + " ,");
        }
        tokenStream.end();
    }
}
From source file:org.hibernate.search.test.envers.SearchAndEnversIntegrationTest.java
License:LGPL
private Query createLuceneQuery(String term, String value) { String searchQuery = term + ":" + value; QueryParser parser = new QueryParser(term, new StopAnalyzer()); Query luceneQuery;//from w w w .j a va 2 s. c o m try { luceneQuery = parser.parse(searchQuery); } catch (ParseException e) { throw new RuntimeException("Unable to parse query", e); } return luceneQuery; }
From source file:query.Response.java
/**
 * Wires up the query responder: builds the inverted index up front and
 * prepares the analyzer and result getter used for incoming queries.
 */
public Response() {
    this.invertedIndex = new InvertedIndex();
    this.invertedIndexMap = this.invertedIndex.createInvertedIndex();
    this.analyzer = new StopAnalyzer();
    this.resultGetter = new ResultGetter();
}
From source file:stroom.index.server.analyzer.AnalyzerFactory.java
License:Apache License
/**
 * Creates the Lucene analyzer matching the requested type.
 *
 * @param analyzerType  which analyzer family to build
 * @param caseSensitive honoured only by the analyzers whose constructor takes it
 * @return a freshly constructed analyzer, never null
 */
public static Analyzer create(final AnalyzerType analyzerType, final boolean caseSensitive) {
    switch (analyzerType) {
    case KEYWORD:
        return new KeywordAnalyzer(caseSensitive);
    case ALPHA:
        return new AlphaAnalyzer(caseSensitive);
    case ALPHA_NUMERIC:
        return new AlphaNumericAnalyzer(caseSensitive);
    case NUMERIC:
        return new NumericAnalyzer();
    case WHITESPACE:
        return new WhitespaceAnalyzer();
    case STOP:
        return new StopAnalyzer();
    case STANDARD:
        return new StandardAnalyzer();
    default:
        // NOTE(review): the fallback ignores caseSensitive and always builds
        // a case-sensitive keyword analyzer — same as the original code;
        // confirm this is intended for unrecognised types.
        return new KeywordAnalyzer(true);
    }
}