Usage examples for org.apache.lucene.analysis.tokenattributes.CharTermAttribute#buffer()
public char[] buffer();
From source file:at.ac.univie.mminf.luceneSKOS.analysis.AbstractMeSHFilter.java
License:Apache License
public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException { TokenStream ts = analyzer.tokenStream("", new StringReader(text)); CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); // PositionIncrementAttribute posIncAtt = // ts.addAttribute(PositionIncrementAttribute.class); ts.reset();//w w w.ja v a 2s.c o m reuse.length = 0; while (ts.incrementToken()) { int length = termAtt.length(); if (length == 0) { throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token"); } // if (posIncAtt.getPositionIncrement() != 1) { // throw new IllegalArgumentException("term: " + text + // " analyzed to a token with posinc != 1"); // } reuse.grow(reuse.length + length + 1); /* current + word + separator */ int end = reuse.offset + reuse.length; if (reuse.length > 0) { reuse.chars[end++] = 32; // space reuse.length++; } System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length); reuse.length += length; } ts.end(); ts.close(); if (reuse.length == 0) { throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer"); } return reuse; }
From source file:at.ac.univie.mminf.luceneSKOS.analysis.SNOMEDFilter.java
License:Apache License
public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException { TokenStream ts = analyzer.tokenStream("", new StringReader(text)); CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); // PositionIncrementAttribute posIncAtt = // ts.addAttribute(PositionIncrementAttribute.class); boolean phraseTerm = false; ts.reset();//from www . j av a2 s . c om reuse.length = 0; while (ts.incrementToken()) { // System.out.println(text + " | " + termAtt.toString()); int length = termAtt.length(); if (length == 0) { throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token"); } // if (posIncAtt.getPositionIncrement() != 1) { // throw new IllegalArgumentException("term: " + text + // " analyzed to a token with posinc != 1"); // } reuse.grow(reuse.length + length + 1); /* * current + word + * separator */ int end = reuse.offset + reuse.length; if (reuse.length > 0) { reuse.chars[end++] = 32; // space reuse.length++; phraseTerm = true; } System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length); reuse.length += length; } ts.end(); ts.close(); if (reuse.length == 0) { throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer"); } if (phraseTerm) { reuse.grow(reuse.length + 2); /* current + word + separator */ reuse.length += 2; char next = reuse.chars[0]; for (int i = 0; i < reuse.length - 2; i++) { char tmp = reuse.chars[i + 1]; reuse.chars[i + 1] = next; next = tmp; } reuse.chars[0] = '\"'; reuse.chars[reuse.length - 1] = '\"'; } return reuse; }
From source file:com.billiger.solr.handler.component.QLTBComponent.java
License:Apache License
/**
 * Get analyzed version of the query string.
 *
 * This uses the analyzer for the configured FieldType for this
 * component to analyze and re-assemble the original query string.
 * If no queryFieldType is configured, the original query will be
 * returned.
 *
 * This is used both in the prepare() stage of the component and
 * when reading the QLTB map data.
 *
 * @param query the raw query string
 * @return the concatenated analyzed tokens (no separators), or {@code query}
 *         unchanged when no analyzer is configured
 * @throws IOException if the token stream fails
 */
String getAnalyzedQuery(String query) throws IOException {
    if (analyzer == null) {
        return query;
    }
    StringBuilder norm = new StringBuilder();
    // try-with-resources closes the stream even when incrementToken()
    // throws; the original leaked it on that path.
    try (TokenStream tokens = analyzer.tokenStream("", new StringReader(query))) {
        CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
        tokens.reset();
        while (tokens.incrementToken()) {
            norm.append(termAtt.buffer(), 0, termAtt.length());
        }
        tokens.end(); // per TokenStream contract: flush end-of-stream state
    }
    return norm.toString();
}
From source file:com.clustertest2.clustertest2.vectorization.DocTokenizer.java
public void performWork(Path doc) throws IOException { try {// ww w . j av a2 s .c o m System.out.println("performing token work"); HashMap<Text, StringTuple> tokenized = new HashMap<>(); StringBuilder part = new StringBuilder(); // store the tokens of each doc for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(doc, PathType.GLOB, ClusterFileService.CONF)) { String key = pair.getFirst().toString(); System.out.println(key); String value = pair.getSecond().toString(); part.append(key); TokenStream stream = analyzer.tokenStream(key, new StringReader(value)); CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); stream.reset(); StringTuple document = new StringTuple(); while (stream.incrementToken()) { if (termAtt.length() > 0) { document.add(new String(termAtt.buffer(), 0, termAtt.length())); } } stream.end(); stream.close(); tokenized.put(new Text(key), document); } // write the sequencefile Path tokenizedSeq = new Path(vectorsDir, part.toString()); try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS, ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) { for (Text k : tokenized.keySet()) { writer.append(k, tokenized.get(k)); } writer.close(); System.out.println("wrote"); } } catch (Exception e) { System.out.println(e.getMessage()); } finally { numThreads.decrementAndGet(); } }
From source file:com.clustertest2.clustertest2.vectorization.TokenBuilder.java
@Override public void performWork(Path inputDoc, Path outputDir) { try {// w ww . j av a 2 s . c o m HashMap<Text, StringTuple> tokenized = new HashMap<>(); StringBuilder docName = new StringBuilder(); // store the tokens of each doc for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(inputDoc, PathType.GLOB, ClusterFileService.CONF)) { String key = pair.getFirst().toString(); String value = pair.getSecond().toString(); docName.append(key); StringTuple document; try (TokenStream stream = analyzer.tokenStream(key, new StringReader(value))) { CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); stream.reset(); document = new StringTuple(); while (stream.incrementToken()) { if (termAtt.length() > 0) { document.add(new String(termAtt.buffer(), 0, termAtt.length())); } } stream.end(); } tokenized.put(new Text(key), document); } // write the sequencefile Path tokenizedSeq = new Path(outputDir, docName.toString()); // overwrite old vector file ClusterFileService.FS.delete(tokenizedSeq, true); try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS, ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) { for (Text k : tokenized.keySet()) { writer.append(k, tokenized.get(k)); } writer.close(); } } catch (IOException e) { System.out.println(e.getMessage()); } finally { numThreads.decrementAndGet(); } }
From source file:com.digitalpebble.behemoth.mahout.LuceneTokenizerMapper.java
License:Apache License
@Override protected void map(Text key, BehemothDocument value, Context context) throws IOException, InterruptedException { String sContent = value.getText(); if (sContent == null) { // no text available? skip context.getCounter("LuceneTokenizer", "BehemothDocWithoutText").increment(1); return;/*w w w. j a va2 s.c o m*/ } TokenStream stream = analyzer.reusableTokenStream(key.toString(), new StringReader(sContent.toString())); CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); StringTuple document = new StringTuple(); stream.reset(); while (stream.incrementToken()) { if (termAtt.length() > 0) { document.add(new String(termAtt.buffer(), 0, termAtt.length())); } } context.write(key, document); }
From source file:com.doculibre.constellio.lucene.BaseLuceneIndexHelper.java
License:Open Source License
public static String analyze(String str, Analyzer analyzer) throws IOException { if (analyzer == null) { return str; }//from w w w. j a va 2s. c o m StringBuilder norm = new StringBuilder(); TokenStream tokens = analyzer.tokenStream("", new StringReader(str)); tokens.reset(); CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class); while (tokens.incrementToken()) { norm.append(termAtt.buffer(), 0, termAtt.length()); } return norm.toString(); }
From source file:com.doculibre.constellio.utils.AnalyzerUtils.java
License:Open Source License
public static String analyzePhrase(String phrase, boolean useStopWords) { if (StringUtils.isNotBlank(phrase)) { String analysedPhrase;// ww w. j av a 2 s .co m Analyzer analyzer = getDefaultAnalyzer(useStopWords); StringBuilder norm = new StringBuilder(); TokenStream tokens; try { tokens = analyzer.tokenStream("", new StringReader(phrase)); tokens.reset(); CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class); while (tokens.incrementToken()) { norm.append(termAtt.buffer(), 0, termAtt.length()); } analysedPhrase = norm.toString().trim(); } catch (IOException e) { throw new RuntimeException(e); } return analysedPhrase; } else { return phrase; } }
From source file:com.ginobefunny.elasticsearch.plugins.synonym.service.SimpleSynonymMap.java
License:Apache License
private Set<String> analyze(String text) throws IOException { Set<String> result = new HashSet<String>(); Analyzer analyzer = configuration.getAnalyzer(); try (TokenStream ts = analyzer.tokenStream("", text)) { CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); ts.reset();// w ww .j av a 2 s . co m while (ts.incrementToken()) { int length = termAtt.length(); if (length == 0) { throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token"); } if (posIncAtt.getPositionIncrement() != 1) { throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1"); } result.add(new String(termAtt.buffer(), 0, termAtt.length())); } ts.end(); return result; } }
From source file:com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java
License:Apache License
private String tokenizerToString(Tokenizer tokenizer) throws Exception { OffsetAttribute extOffset = tokenizer.addAttribute(OffsetAttribute.class); PositionIncrementAttribute posIncrAtt = tokenizer.addAttribute(PositionIncrementAttribute.class); PositionLengthAttribute posLengthAtt = tokenizer.addAttribute(PositionLengthAttribute.class); CharTermAttribute term = (CharTermAttribute) tokenizer.addAttribute(CharTermAttribute.class); TypeAttribute type = (TypeAttribute) tokenizer.addAttribute(TypeAttribute.class); SemanticClassAttribute semanticClass = (SemanticClassAttribute) tokenizer .addAttribute(SemanticClassAttribute.class); PartOfSpeechAttribute pos = (PartOfSpeechAttribute) tokenizer.addAttribute(PartOfSpeechAttribute.class); StringBuilder result = new StringBuilder(); while (tokenizer.incrementToken() == true) { result.append(new String(term.buffer(), 0, term.length())).append(":"); result.append(type.type()).append(":"); result.append(pos.partOfSpeech()).append(":"); result.append(semanticClass.semanticClass()).append(":"); result.append(String.valueOf(posIncrAtt.getPositionIncrement())).append(":"); result.append(String.valueOf(posLengthAtt.getPositionLength())).append(":"); result.append(String.valueOf(extOffset.startOffset())).append(":"); result.append(String.valueOf(extOffset.endOffset())); result.append(","); }// ww w. j a v a 2 s . com tokenizer.end(); return result.toString(); }