List of usage examples for org.apache.lucene.analysis.tokenattributes CharTermAttribute setEmpty
public CharTermAttribute setEmpty();
From source file:com.fujitsu.ca.fic.caissepop.evaluation.TokenizeText.java
License:Apache License
@Override public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() < 1 || input.isNull(0)) { return null; }//from w ww . j a v a 2s .c o m DataBag bagOfTokens = bagFactory.newDefaultBag(); TokenStream tokenStream = null; try { String lineOfText = input.get(0).toString(); StringReader textInput = new StringReader(lineOfText); tokenStream = analyzer.tokenStream(noField, textInput); CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { Tuple termText = tupleFactory.newTuple(termAttribute.toString()); bagOfTokens.add(termText); termAttribute.setEmpty(); } } finally { if (tokenStream != null) { tokenStream.close(); } } return bagOfTokens; }
From source file:com.github.ippeiukai.externaltoken.lucene.analysis.TestPatternTokenizer.java
License:Apache License
/** * TODO: rewrite tests not to use string comparison. *//*from w w w . j ava2 s . c om*/ private static String tsToString(TokenStream in) throws IOException { StringBuilder out = new StringBuilder(); CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class); // extra safety to enforce, that the state is not preserved and also // assign bogus values in.clearAttributes(); termAtt.setEmpty().append("bogusTerm"); while (in.incrementToken()) { out.append(termAtt.toString()); in.clearAttributes(); termAtt.setEmpty().append("bogusTerm"); out.append(' '); } if (out.length() > 0) out.deleteCharAt(out.length() - 1); in.close(); return out.toString(); }
From source file:com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer.java
License:Apache License
public static void main(String[] args) throws IOException { Set<String> stopwords = Dictionary .loadDictionary(new Path("file:///Users/xstevens/workspace/akela/stopwords-en.txt")); NGramEnglishAnalyzer analyzer = new com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer( Version.LUCENE_31, stopwords, false, true); TokenStream stream = analyzer.tokenStream("", new StringReader("When I was growing up this was so much fun.")); CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class); while (stream.incrementToken()) { if (termAttr.length() > 0) { System.out.println(termAttr.toString()); termAttr.setEmpty(); }//from ww w. j a va 2 s . c o m } }
From source file:com.mozilla.grouperfish.pig.eval.text.NGramTokenize.java
License:Apache License
@Override public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() == 0) { return null; }//w ww . j a va 2 s. c o m if (analyzer == null) { String langCode = "en"; if (input.size() > 1) { loadDictionary((String) input.get(1)); } boolean stem = false; if (input.size() > 2) { stem = Boolean.parseBoolean((String) input.get(2)); } boolean outputUnigrams = false; if (input.size() > 3) { outputUnigrams = Boolean.parseBoolean((String) input.get(3)); } int minNGram = 2; if (input.size() > 4) { minNGram = Integer.parseInt((String) input.get(4)); } int maxNGram = 3; if (input.size() > 5) { maxNGram = Integer.parseInt((String) input.get(5)); } if (input.size() > 6) { langCode = (String) input.get(6); } if (stopwords != null && stopwords.size() != 0) { analyzer = new com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer(Version.LUCENE_31, stopwords, stem, outputUnigrams, minNGram, maxNGram); } else { analyzer = new com.mozilla.grouperfish.lucene.analysis.en.NGramEnglishAnalyzer(Version.LUCENE_31, StandardAnalyzer.STOP_WORDS_SET, stem, outputUnigrams, minNGram, maxNGram); } } DataBag output = bagFactory.newDefaultBag(); TokenStream stream = analyzer.tokenStream(NOFIELD, new StringReader((String) input.get(0))); CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class); while (stream.incrementToken()) { if (termAttr.length() > 0) { Tuple t = tupleFactory.newTuple(termAttr.toString()); output.add(t); termAttr.setEmpty(); } } return output; }
From source file:com.mozilla.grouperfish.pig.eval.text.Tokenize.java
License:Apache License
@Override
public DataBag exec(Tuple input) throws IOException {
    // Tokenizes input.get(0) with a language-appropriate analyzer and returns
    // one tuple per token. Optional positional args: 1=stopword dictionary
    // path, 2=stem flag, 3=language code (selects CJK/German/Spanish/English).
    if (input == null || input.size() == 0) {
        return null;
    }
    if (analyzer == null) {
        // Analyzer is built lazily from the first tuple's option fields.
        String langCode = "en";
        if (input.size() > 1) {
            loadDictionary((String) input.get(1));
        }
        boolean stem = false;
        if (input.size() > 2) {
            stem = Boolean.parseBoolean((String) input.get(2));
        }
        if (input.size() > 3) {
            langCode = (String) input.get(3);
        }
        if (langCode.startsWith("zh") || langCode.startsWith("ja")) {
            analyzer = new org.apache.lucene.analysis.cjk.CJKAnalyzer(Version.LUCENE_31);
        } else if (langCode.startsWith("de")) {
            analyzer = new org.apache.lucene.analysis.de.GermanAnalyzer(Version.LUCENE_31);
        } else if (langCode.startsWith("es")) {
            analyzer = new org.apache.lucene.analysis.es.SpanishAnalyzer(Version.LUCENE_31);
        } else {
            if (stopwords != null && stopwords.size() > 0) {
                analyzer = new EnglishAnalyzer(Version.LUCENE_31, stopwords, stem);
            } else {
                analyzer = new EnglishAnalyzer(Version.LUCENE_31, stem);
            }
        }
    }
    DataBag output = bagFactory.newDefaultBag();
    TokenStream stream = analyzer.tokenStream(NOFIELD, new StringReader((String) input.get(0)));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    // FIX: observe the full Lucene TokenStream lifecycle (reset/end/close),
    // all of which the original omitted.
    stream.reset();
    while (stream.incrementToken()) {
        if (termAttr.length() > 0) {
            // incrementToken() overwrites the buffer; setEmpty() was redundant.
            output.add(tupleFactory.newTuple(termAttr.toString()));
        }
    }
    stream.end();
    stream.close();
    return output;
}
From source file:com.mozilla.grouperfish.transforms.coclustering.lucene.analysis.en.NGramEnglishAnalyzer.java
License:Apache License
public static void main(String[] args) throws IOException { // TODO: SMELLY: de-system-ify Set<String> stopwords = Dictionary .loadDictionary(new Path("file:///Users/xstevens/workspace/akela/stopwords-en.txt")); NGramEnglishAnalyzer analyzer = new NGramEnglishAnalyzer(Version.LUCENE_31, stopwords, false, true); TokenStream stream = analyzer.tokenStream("", new StringReader("When I was growing up this was so much fun.")); CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class); while (stream.incrementToken()) { if (termAttr.length() > 0) { System.out.println(termAttr.toString()); termAttr.setEmpty(); }/* ww w .j ava2s.co m*/ } }
From source file:com.mozilla.grouperfish.transforms.coclustering.pig.eval.text.NGramTokenize.java
License:Apache License
@Override public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() == 0) { return null; }//from w w w .j av a2 s .c o m if (analyzer == null) { if (input.size() > 1) { loadDictionary((String) input.get(1)); } boolean stem = false; if (input.size() > 2) { stem = Boolean.parseBoolean((String) input.get(2)); } boolean outputUnigrams = false; if (input.size() > 3) { outputUnigrams = Boolean.parseBoolean((String) input.get(3)); } int minNGram = 2; if (input.size() > 4) { minNGram = Integer.parseInt((String) input.get(4)); } int maxNGram = 3; if (input.size() > 5) { maxNGram = Integer.parseInt((String) input.get(5)); } if (stopwords != null && stopwords.size() != 0) { analyzer = new NGramEnglishAnalyzer(Version.LUCENE_31, stopwords, stem, outputUnigrams, minNGram, maxNGram); } else { analyzer = new NGramEnglishAnalyzer(Version.LUCENE_31, StandardAnalyzer.STOP_WORDS_SET, stem, outputUnigrams, minNGram, maxNGram); } } DataBag output = bagFactory.newDefaultBag(); TokenStream stream = analyzer.tokenStream(NOFIELD, new StringReader((String) input.get(0))); CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class); while (stream.incrementToken()) { if (termAttr.length() > 0) { Tuple t = tupleFactory.newTuple(termAttr.toString()); output.add(t); termAttr.setEmpty(); } } return output; }
From source file:com.sindicetech.siren.analysis.attributes.NodeNumericTermAttributeImpl.java
License:Open Source License
/**
 * Advances this numeric token stream by one precision step, writing the
 * next generated token into {@code termAtt}.
 *
 * @param termAtt the term attribute to receive the next token's text
 * @return true if a token was produced; false once all precision levels
 *         ({@code shift >= valueSize}) have been emitted
 */
public boolean incrementShift(final CharTermAttribute termAtt) {
    // check if we reach end of the stream
    if (shift >= valueSize) {
        return false;
    }
    try {
        // generate the next token and update the char term attribute
        this.bytesRefToChar(termAtt);
        // increment shift for next token
        shift += precisionStep;
        return true;
    } catch (final IllegalArgumentException iae) {
        // return empty token before first or after last
        termAtt.setEmpty();
        // ends the numeric tokenstream
        shift = valueSize;
        return false;
    }
}
From source file:com.underthehood.weblogs.lucene.AutoPhrasingTokenFilter.java
License:Apache License
private void emit(char[] token) { //System.out.println("emit: " + new String(token)); if (replaceWhitespaceWith != null) { token = replaceWhiteSpace(token); }/* ww w. j a v a 2 s. c o m*/ CharTermAttribute termAttr = getTermAttribute(); termAttr.setEmpty(); termAttr.append(new StringBuilder().append(token)); OffsetAttribute offAttr = getOffsetAttribute(); if (offAttr != null && offAttr.endOffset() >= token.length) { int start = offAttr.endOffset() - token.length; offAttr.setOffset(start, offAttr.endOffset()); } PositionIncrementAttribute pia = getPositionIncrementAttribute(); if (pia != null) { pia.setPositionIncrement(++positionIncr); } lastEmitted = token; }
From source file:org.elasticsearch.index.analysis.PatternTokenizerTests.java
License:Apache License
/** * TODO: rewrite tests not to use string comparison. *//*from w ww .j a va 2 s . c o m*/ private static String tsToString(TokenStream in) throws IOException { StringBuilder out = new StringBuilder(); CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class); // extra safety to enforce, that the state is not preserved and also // assign bogus values in.clearAttributes(); termAtt.setEmpty().append("bogusTerm"); in.reset(); while (in.incrementToken()) { if (out.length() > 0) out.append(' '); out.append(termAtt.toString()); in.clearAttributes(); termAtt.setEmpty().append("bogusTerm"); } in.close(); return out.toString(); }