List of usage examples for org.apache.lucene.analysis TokenStream incrementToken
public abstract boolean incrementToken() throws IOException;
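All of the examples below follow the consumer contract that the TokenStream javadoc prescribes: obtain the stream, register the attributes you want to read, call reset(), loop on incrementToken() (which returns false once the stream is exhausted), then call end() and close(). A minimal, self-contained sketch of that workflow, assuming a recent Lucene release where StandardAnalyzer has a no-argument constructor; the field name and input text are placeholders:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamContract {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream stream = analyzer.tokenStream("body", new StringReader("Hello token streams"));
        // Attributes are fetched once, outside the loop: the same instance is
        // updated in place on every incrementToken() call.
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        try {
            stream.reset();                    // mandatory before the first incrementToken()
            while (stream.incrementToken()) {  // false once the stream is exhausted
                System.out.println(term.toString());
            }
            stream.end();                      // records the final offset state
        } finally {
            stream.close();
        }
        analyzer.close();
    }
}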
From source file:at.tuwien.ifs.somtoolbox.apps.viewer.DocViewPanel.java
License:Apache License
private void updateWeightHighlighting() {
    // Remove previous highlighting
    removeHighLights(weightingHighLights);
    if (weightHighlightBox.isSelected()) {
        if (inputDataObjects.getTemplateVector() == null) {
            Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(
                    "Template vector file needed for displaying weights. Load from the File->Data files menu");
            weightHighlightBox.setSelected(false);
            return;
        }
        if (inputDataObjects.getInputData() == null) {
            Logger.getLogger("at.tuwien.ifs.somtoolbox").severe(
                    "Input data file needed for displaying weights. Load from the File->Data files menu");
            weightHighlightBox.setSelected(false);
            return;
        }
        SOMLibTemplateVector tv = inputDataObjects.getTemplateVector();
        InputData data = inputDataObjects.getInputData();
        InputDatum input = data.getInputDatum(currentInput);
        double maxValue = data.getMaxValue();
        double minValue = data.getMinValue();
        double span = maxValue - minValue;

        // Initialise one highlight painter per palette colour
        Palette p = paletteSelectionPanel.getSelectedPalette();
        int paletteLength = p.getNumberOfColours();
        weightPaints = new DefaultHighlighter.DefaultHighlightPainter[paletteLength];
        for (int i = 0; i < weightPaints.length; i++) {
            weightPaints[i] = new DefaultHighlighter.DefaultHighlightPainter(p.getColor(i));
        }

        String text = textPane.getText();
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
        try {
            stream.reset();
            while (stream.incrementToken()) {
                TypeAttribute typeAttribute = stream.getAttribute(TypeAttribute.class);
                if (!at.tuwien.ifs.somtoolbox.util.StringUtils.equalsAny(typeAttribute.type(), "<APOSTROPHE>")) {
                    TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);
                    String term = termAttribute.term();
                    if (tv.containsLabel(term)) {
                        // Map the term's weight to a palette colour and highlight its character span
                        int index = tv.getIndex(term);
                        double value = input.getVector().getQuick(index);
                        int colorIndex = (int) (paletteLength / 4d
                                + relativeValue(minValue, span, value) * paletteLength / 2d);
                        OffsetAttribute offsetAttribute = stream.getAttribute(OffsetAttribute.class);
                        Object tag = highlighter.addHighlight(offsetAttribute.startOffset(),
                                offsetAttribute.endOffset(), weightPaints[colorIndex]);
                        weightingHighLights.add(tag);
                    }
                }
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (BadLocationException e) {
            e.printStackTrace();
        }
    }
}
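The TermAttribute used above is a Lucene 3.x API that was removed in 4.0; CharTermAttribute is its replacement, and from 4.x on the reset() call is mandatory rather than merely recommended. A minimal sketch of the same per-token loop against the newer attribute API (field name reused from the example):

TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
stream.reset();
while (stream.incrementToken()) {
    String term = termAtt.toString();
    int start = offsetAtt.startOffset();   // character span to highlight
    int end = offsetAtt.endOffset();
    // look up the term's weight and add the highlight here
}
stream.end();
stream.close();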
From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.newsFetch.storm.bolts.TweetAnalyzerBolt.java
License:Apache License
@Override
public void execute(Tuple input) {
    try {
        String tweet = (String) input.getValueByField(StreamIDs.TWEET);
        Reader reader = new StringReader(tweet);
        // Pick a language-specific analyzer based on the detected tweet language
        LanguageIdentifier identifier = new LanguageIdentifier(tweet);
        NewsRecLuceneAnalyzer analyzer = LanguageAnalyzerHelper.getInstance()
                .getAnalyzer(new Locale(identifier.getLanguage()));
        TokenStream tokenStream = analyzer.tokenStream("", reader);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            collector.emit(StreamIDs.TERMSTREAM, new Values(term));
        }
        tokenStream.end();
        reader.close();
        tokenStream.close();
        // Also emit named entities extracted from the raw tweet
        for (String term : extractNames(tweet, analyzer.getStopwords())) {
            collector.emit(StreamIDs.TERMSTREAM, new Values(term));
        }
    } catch (IOException ex) {
        logger.error(ex);
    }
}
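For the emits on StreamIDs.TERMSTREAM to reach downstream bolts, the bolt also has to declare that stream in declareOutputFields. The excerpt does not show that method; a plausible sketch, where the output field name "term" is an assumption rather than something taken from the original source:

@Override
public void declareOutputFields(OutputFieldsDeclarer declarer) {
    // Hypothetical: the real field name used by TweetAnalyzerBolt is not shown in the excerpt.
    declarer.declareStream(StreamIDs.TERMSTREAM, new Fields("term"));
}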
From source file:bixo.examples.webmining.PhraseShingleAnalyzer.java
License:Apache License
public List<String> getTermList(String contentText) {
    List<String> result = new ArrayList<String>(contentText.length() / 10);
    try {
        TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            if (termAtt.length() > 0) {
                String term = termAtt.toString();
                result.add(term);
            }
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }
    return result;
}
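The excerpt does not show how _analyzer is constructed. Given the class name, a shingle (token n-gram) analyzer is presumably intended; one plausible construction using Lucene's shingle module, with the maximum shingle size of 2 purely illustrative:

// Hypothetical wiring; the real PhraseShingleAnalyzer may configure this differently.
Analyzer _analyzer = new ShingleAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_40), 2);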
From source file:br.bireme.ngrams.Tools.java
public static void showTokens(final Analyzer analyzer, final String fieldName, final String text)
        throws IOException {
    TokenStream tokenStream = analyzer.tokenStream(fieldName, text);
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        final String term = charTermAttribute.toString();
        System.out.println(term + " [" + startOffset + "," + endOffset + "]");
    }
    tokenStream.end();
    tokenStream.close();
}
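A short usage example: with a StandardAnalyzer, the call below prints each term followed by its character offsets (stop words such as "The" are dropped; the exact output depends on the analyzer and Lucene version):

Tools.showTokens(new StandardAnalyzer(Version.LUCENE_4_9), "contents", "The quick brown fox");
// quick [4,9]
// brown [10,15]
// fox [16,19]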
From source file:br.edu.utfpr.cm.JGitMinerWeb.services.matrix.auxiliary.LuceneUtil.java
public static List<String> tokenizeString(String linha) {
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_46);
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(linha));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        System.out.println(e.getMessage());
    }
    return result;
}
From source file:br.edu.utfpr.cm.JGitMinerWeb.util.LuceneUtil.java
public static List<String> tokenizeString(String linha) {
    Analyzer analyzer = new StopAnalyzer();
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(linha));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        System.out.println(e.getMessage());
    }
    return result;
}
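Since TokenStream implements Closeable from Lucene 4.x on, the explicit end()/close() pair in both tokenizeString variants can also be written with try-with-resources, which closes the stream even when incrementToken() throws; a minimal sketch of the same loop:

// Same loop with try-with-resources; assumes Lucene 4.x+ where TokenStream is Closeable.
try (TokenStream stream = analyzer.tokenStream(null, new StringReader(linha))) {
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        result.add(term.toString());
    }
    stream.end();
} catch (IOException e) {
    System.out.println(e.getMessage());
}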
From source file:br.ufmt.harmonizacao.implementer.PatenteeSearcher.java
public List<String> search(String field, String value) {
    try {
        long start = System.currentTimeMillis();
        // Normalize the query value with the same analyzer used at index time
        TokenStream stream = analyzer.tokenStream(field, new StringReader(value));
        CharTermAttribute attr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        String valor = "";
        while (stream.incrementToken()) {
            valor = valor + attr.toString() + ' ';
        }
        BooleanQuery bq = new BooleanQuery();
        BooleanQuery acronymBq = null;
        BooleanQuery wrapBq = new BooleanQuery();
        String[] tokens = valor.split(" ");
        for (int i = 0; i < tokens.length; i++) {
            if (tokens.length >= 2) {
                acronymBq = new BooleanQuery();
                switch (i) {
                    case 0:
                        acronymBq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                        bq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                        break;
                    case 1:
                        acronymBq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST_NOT);
                        bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                        bq.add(new LengthQuery(field, valor), BooleanClause.Occur.MUST_NOT);
                        break;
                    default:
                        break;
                }
            } else {
                if (tokens[i].length() > 3) {
                    bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                } else {
                    bq.add(new TermQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                }
            }
        }
        stream.end();
        stream.close();
        // The fuzzy query is what performs the approximate matching
        wrapBq.add(bq, BooleanClause.Occur.MUST);
        if (acronymBq != null) {
            wrapBq.add(acronymBq, BooleanClause.Occur.MUST_NOT);
        }
        String queryTime = "Query construction time: " + (System.currentTimeMillis() - start) + "ms";
        // Fetch the documents found by the search
        start = System.currentTimeMillis();
        ScoreDoc[] hits = searcher.search(wrapBq, 10).scoreDocs;
        String searchTime = "Search time: " + (System.currentTimeMillis() - start) + "ms";
        List<String> result = new ArrayList<String>();
        result.add(valor);
        if (hits.length > 0) {
            for (int i = 0; i < hits.length; i++) {
                Document hitDoc = searcher.doc(hits[i].doc);
                result.add(hitDoc.get(field));
            }
        }
        result.add(queryTime);
        result.add(searchTime);
        return result;
    } catch (IOException ex) {
        Logger.getLogger(PatenteeSearcher.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;
}
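A note on the query-building API used here: mutating a BooleanQuery directly with add(...) works up to Lucene 5.2. From 5.3 on, BooleanQuery is immutable and the same clauses go through a builder; a minimal sketch of the translation, shown only for the first prefix clause:

// Lucene 5.3+ form of "new BooleanQuery()" followed by add(...) calls.
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.add(new PrefixQuery(new Term(field, tokens[0])), BooleanClause.Occur.SHOULD);
BooleanQuery bq = builder.build();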
From source file:byrne.mitre.MitreQuery.java
License:Apache License
public void run() {
    try {
        TokenStream tokenStream = analyzer.tokenStream("ngrams", new StringReader(entry.getFullName()));
        BooleanQuery bq = new BooleanQuery();
        tokenStream.reset();
        // Build one SHOULD clause per n-gram of the full name
        while (tokenStream.incrementToken()) {
            Term t = new Term("ngrams", tokenStream.getAttribute(TermAttribute.class).term());
            bq.add(new TermQuery(t), BooleanClause.Occur.SHOULD);
        }
        tokenStream.end();
        tokenStream.close();
        TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
        searcher.search(bq, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        for (int i = 0; i < hits.length; ++i) {
            int docId = hits[i].doc;
            Document d = searcher.doc(docId);
            out.write(entry.getID() + "|" + d.get("id") + "|" + df.format(hits[i].score) + "\n");
        }
    } catch (IOException IOE) {
        // Exceptions are silently swallowed in the original source.
    }
}
From source file:ca.ualberta.entitylinking.common.indexing.TFIDF3x.java
License:Open Source License
/**
 * Filter the string with StandardAnalyzer.
 * @param str The string to process.
 * @param removeStopWords Indicate if the stop words should be removed.
 * @return The filtered string, with terms separated by single spaces.
 */
public static String processString(String str, boolean removeStopWords) {
    StringBuffer strBuf = new StringBuffer();
    try {
        Analyzer analyzer = null;
        if (removeStopWords)
            analyzer = new StandardAnalyzer(Version.LUCENE_34);
        else
            analyzer = new TextAnalyzerWithStopwords(Version.LUCENE_34);
        TokenStream tokenStream = analyzer.tokenStream("string", new StringReader(str));
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            strBuf.append(term + " ");
        }
        tokenStream.end();
        tokenStream.close();
        analyzer.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return strBuf.toString().trim();
}
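A short usage note: with removeStopWords set to true, the StandardAnalyzer lowercases terms and drops English stop words, so a call such as the following returns the remaining terms joined by single spaces (the exact result depends on the configured stop-word set):

String cleaned = TFIDF3x.processString("The Quick Brown Fox", true);
// cleaned == "quick brown fox"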
From source file:ca.ualberta.entitylinking.common.indexing.TFIDF3x.java
License:Open Source License
/**
 * This function assumes that the TF-IDF vector of the document containing the text is already
 * given. We simply build a tf-idf vector of the text out of the docVector. The purpose is to
 * save the time of recomputing the tf-idf value for words in the same document.
 *
 * @param text
 * @param docVector
 * @return
 */
public Map<String, Float> TextTFIDFVector(String text, Map<String, Float> docVector) {
    Map<String, Float> map = new HashMap<String, Float>();
    // Preprocess the text using StandardAnalyzer (StandardAnalyzer2 + StopAnalyzer).
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
    TokenStream tokenStream = analyzer.tokenStream("string", new StringReader(text));
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            if (docVector.containsKey(term))
                map.put(term, docVector.get(term));
        }
        tokenStream.end();
        tokenStream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    analyzer.close();
    return map;
}