List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value. getAttribute does not create the attribute if it is missing; depending on the Lucene version, asking for an attribute the stream does not contain either throws IllegalArgumentException or returns null, which is why consumers typically register the attribute with addAttribute(Class) first.
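Before the per-project examples below, here is a minimal, self-contained sketch of the usual consumption pattern: register the attribute, reset() the stream, read the attribute inside the incrementToken() loop, then end() and close(). It assumes a recent Lucene release (no-argument StandardAnalyzer constructor and the String-based tokenStream overload); the class name, the field name "text", and the sample sentence are arbitrary and only for illustration.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // The field name is arbitrary here; no index is involved.
        try (TokenStream ts = analyzer.tokenStream("text", "Lucene token streams expose attributes")) {
            // addAttribute registers CharTermAttribute if the stream does not have it yet,
            // so the getAttribute call inside the loop is guaranteed to find it.
            ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
                System.out.println(term.toString());
            }
            ts.end();
        }
        analyzer.close();
    }
}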
From source file:com.cloudera.knittingboar.utils.DatasetConverter.java
License:Apache License
public static String ReadFullFile(Analyzer analyzer, String newsgroup_name, String file) throws IOException {
    String out = newsgroup_name + "\t";
    BufferedReader reader = null;
    // Collection<String> words
    Multiset<String> words = ConcurrentHashMultiset.create();
    try {
        reader = new BufferedReader(new FileReader(file));
        TokenStream ts = analyzer.tokenStream("text", reader);
        ts.addAttribute(CharTermAttribute.class);
        ts.reset(); // required before incrementToken() on current Lucene versions
        // for each word in the stream, minus non-word stuff, add word to collection
        while (ts.incrementToken()) {
            String s = ts.getAttribute(CharTermAttribute.class).toString();
            out += s + " ";
        }
    } finally {
        if (reader != null) {
            reader.close();
        }
    }
    return out + "\n";
}
From source file:com.dhamacher.sentimentanalysis4tweets.sentiment.Tokenizer.java
License:Apache License
/**
 * Retrieve the tokens in a String. Behaves like getTokens, but operates on
 * a string instead of a tweet object.
 *
 * @param text The text to tokenize.
 * @return The tokens in the text.
 */
// Version 1
/*public LinkedList<String> getTokens (String text) {
    LinkedList<String> tokens = new LinkedList();
    String[] words = text.split(" ");
    tokens.addAll(Arrays.asList(words));
    return tokens;
}*/
// Version 2
public static LinkedList<String> getTokens(String text) throws IOException {
    LinkedList<String> tokens = new LinkedList();
    TokenStream ts = new StandardTokenizer(new StringReader(text));
    // TermAttribute is the pre-4.0 Lucene term attribute; newer code uses CharTermAttribute
    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
        tokens.add(termAtt.term());
        //System.out.print(termAtt.term());
    }
    return tokens;
}
From source file:com.fujitsu.ca.fic.caissepop.evaluation.TokenizeText.java
License:Apache License
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0)) {
        return null;
    }
    DataBag bagOfTokens = bagFactory.newDefaultBag();
    TokenStream tokenStream = null;
    try {
        String lineOfText = input.get(0).toString();
        StringReader textInput = new StringReader(lineOfText);
        tokenStream = analyzer.tokenStream(noField, textInput);
        CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            Tuple termText = tupleFactory.newTuple(termAttribute.toString());
            bagOfTokens.add(termText);
            termAttribute.setEmpty();
        }
    } finally {
        if (tokenStream != null) {
            tokenStream.close();
        }
    }
    return bagOfTokens;
}
From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java
License:Apache License
private String analyzeQuery(String query, Analyzer analyzer) {
    if (analyzer != null && query != null && query.length() > 0) {
        TokenStream tokenStream = analyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME, new StringReader(query));
        StringBuilder newQueryB = new StringBuilder();
        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
                // OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
                // TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);
                newQueryB.append(term.toString());
                newQueryB.append(' ');
            }
            tokenStream.end();
            return newQueryB.toString().trim();
        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        } finally {
            try {
                tokenStream.close();
            } catch (IOException e) {
                throw new RuntimeException("uncaught exception in synonym processing", e);
            }
        }
    }
    return query;
}
From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java
License:Apache License
/**
 * Given the synonymAnalyzer, returns a list of all alternate queries expanded from the original user query.
 *
 * @param synonymAnalyzer
 * @param solrParams
 * @return
 */
private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) {
    String origQuery = getQueryStringFromParser();
    int queryLen = origQuery.length();

    // TODO: make the token stream reusable?
    TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME,
            new StringReader(origQuery));

    SortedSetMultimap<Integer, TextInQuery> startPosToTextsInQuery = TreeMultimap.create();

    boolean constructPhraseQueries = solrParams.getBool(Params.SYNONYMS_CONSTRUCT_PHRASES, false);
    boolean bag = solrParams.getBool(Params.SYNONYMS_BAG, false);
    List<String> synonymBag = new ArrayList<>();

    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

            if (!typeAttribute.type().equals("shingle")) {
                // ignore shingles; we only care about synonyms and the original text
                // TODO: filter other types as well

                String termToAdd = term.toString();

                if (typeAttribute.type().equals("SYNONYM")) {
                    synonymBag.add(termToAdd);
                }

                // Don't quote single-term synonyms
                if (constructPhraseQueries && typeAttribute.type().equals("SYNONYM")
                        && termToAdd.contains(" ")) {
                    // Don't quote when the original is already surrounded by quotes
                    if (offsetAttribute.startOffset() == 0 || offsetAttribute.endOffset() == queryLen
                            || origQuery.charAt(offsetAttribute.startOffset() - 1) != '"'
                            || origQuery.charAt(offsetAttribute.endOffset()) != '"') {
                        // make a phrase out of the synonym
                        termToAdd = new StringBuilder(termToAdd).insert(0, '"').append('"').toString();
                    }
                }
                if (!bag) {
                    // create a graph of all possible synonym combinations,
                    // e.g. dog bite, hound bite, dog nibble, hound nibble, etc.
                    TextInQuery textInQuery = new TextInQuery(termToAdd, offsetAttribute.startOffset(),
                            offsetAttribute.endOffset());
                    startPosToTextsInQuery.put(offsetAttribute.startOffset(), textInQuery);
                }
            }
        }
        tokenStream.end();
    } catch (IOException e) {
        throw new RuntimeException("uncaught exception in synonym processing", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        }
    }

    List<String> alternateQueries = synonymBag;

    if (!bag) {
        // use a graph rather than a bag
        List<List<TextInQuery>> sortedTextsInQuery = new ArrayList<>(startPosToTextsInQuery.values().size());
        sortedTextsInQuery.addAll(startPosToTextsInQuery.asMap().values().stream().map(ArrayList::new)
                .collect(Collectors.toList()));

        // have to use the start positions and end positions to figure out all possible combinations
        alternateQueries = buildUpAlternateQueries(solrParams, sortedTextsInQuery);
    }

    // save for debugging purposes
    expandedSynonyms = alternateQueries;

    return createSynonymQueries(solrParams, alternateQueries);
}
From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java
License:Apache License
private void readBaseForm(TokenStream tokenStream, LuceneToken token) {
    BaseFormAttribute baseForm = tokenStream.getAttribute(BaseFormAttribute.class);
    if (baseForm != null) {
        token.setBaseForm(baseForm.getBaseForm());
    }
}
From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java
License:Apache License
private void readInflection(TokenStream tokenStream, LuceneToken token) {
    InflectionAttribute inflection = tokenStream.getAttribute(InflectionAttribute.class);
    if (inflection != null) {
        token.setInflectionForm(LuceneUtil.translateInflectedForm(inflection.getInflectionForm()));
        token.setInflectionType(LuceneUtil.translateInflectionType(inflection.getInflectionType()));
    }
}
From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java
License:Apache License
private void readPartOfSpeech(TokenStream tokenStream, LuceneToken token) {
    PartOfSpeechAttribute partOfSpeech = tokenStream.getAttribute(PartOfSpeechAttribute.class);
    if (partOfSpeech != null) {
        String str = partOfSpeech.getPartOfSpeech();
        if (str != null) {
            token.setPartOfSpeech(LuceneUtil.translatePartOfSpeech(str));
        }
    }
}
From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java
License:Apache License
private void readReading(TokenStream tokenStream, LuceneToken token) {
    ReadingAttribute reading = tokenStream.getAttribute(ReadingAttribute.class);
    if (reading != null) {
        token.setPronunciation(reading.getPronunciation());
        token.setReading(reading.getReading());
    }
}
From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java
License:Apache License
private void readOffset(TokenStream tokenStream, LuceneToken token) {
    OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class);
    if (offset != null) {
        token.setStartOffset(offset.startOffset());
        token.setEndOffset(offset.endOffset());
    }
}