List of usage examples for org.apache.lucene.analysis TokenStream incrementToken
public abstract boolean incrementToken() throws IOException;
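The usual calling sequence for consuming a TokenStream, which all of the examples below follow in some form, is: obtain the stream from an Analyzer, register the attributes you need, call reset(), loop on incrementToken() until it returns false, then call end() and close(). The following is a minimal, self-contained sketch of that sequence (not taken from the source files below); it assumes Lucene 5.x or later, and the WhitespaceAnalyzer and sample text are illustrative choices only.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class IncrementTokenSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("text", "hello token stream world")) {
            // Attributes are registered once and reused in place for every token.
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset();                       // mandatory before the first incrementToken()
            while (ts.incrementToken()) {     // false means the stream is exhausted
                System.out.println(term.toString()
                        + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
            }
            ts.end();                         // records the final offset state
        }                                     // try-with-resources calls close()
    }
}

Because each attribute instance is reused for every token, its values must be read or copied inside the loop, as the examples below do.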
From source file:org.apache.uima.lucas.ProspectiveSearchAE.java
License:Apache License
@Override
public void process(CAS aCAS) throws AnalysisEngineProcessException {
    // First create the index of the document text
    MemoryIndex index = new MemoryIndex();
    List fields = createDocument(aCAS).getFields();
    for (Iterator it = fields.iterator(); it.hasNext();) {
        Field field = (Field) it.next();
        if (field.isIndexed() && field.tokenStreamValue() != null) {
            index.addField(field.name(), field.tokenStreamValue());
        }
    }

    // Search all queries against the one document index
    for (SearchQuery query : searchQueryProvider.getSearchQueries(aCAS)) {
        float score = index.search(query.query());
        if (score > matchingThreshold) {
            // Add a FS to the CAS with the search result
            FeatureStructure searchResult = aCAS.createFS(searchResultType);
            searchResult.setLongValue(searchResultIdFeature, query.id());
            aCAS.addFsToIndexes(searchResult);

            // Find matching tokens and link their annotations
            // in case the user wants search term highlighting
            if (searchResultMatchingTextFeature != null) {
                fields = createDocument(aCAS).getFields();
                for (Iterator it = fields.iterator(); it.hasNext();) {
                    Field field = (Field) it.next();
                    if (field.isIndexed() && field.tokenStreamValue() != null) {
                        TokenStream tokenStream = field.tokenStreamValue();
                        Collection<AnnotationFS> matchingTextAnnotations = new LinkedList<AnnotationFS>();
                        QueryScorer scorer = new QueryScorer(query.query(), field.name());
                        scorer.startFragment(new TextFragment(new StringBuffer(aCAS.getDocumentText()), 0, 0));
                        try {
                            scorer.init(tokenStream);
                            OffsetAttribute offsetAttr = null;
                            while (tokenStream.incrementToken()) {
                                offsetAttr = (OffsetAttribute) tokenStream.getAttribute(OffsetAttribute.class);
                                float tokenScore = scorer.getTokenScore();
                                if (tokenScore > 0) {
                                    AnnotationFS annotation = aCAS.createAnnotation(matchingTextType,
                                            offsetAttr.startOffset(), offsetAttr.endOffset());
                                    matchingTextAnnotations.add(annotation);
                                }
                            }
                        } catch (IOException e) {
                            throw new AnalysisEngineProcessException(e);
                        }

                        ArrayFS matchtingTextArray = aCAS.createArrayFS(matchingTextAnnotations.size());
                        int matchtingTextArrayIndex = 0;
                        for (AnnotationFS matchingTextAnnotation : matchingTextAnnotations) {
                            matchtingTextArray.set(matchtingTextArrayIndex++, matchingTextAnnotation);
                        }
                        searchResult.setFeatureValue(searchResultMatchingTextFeature, matchtingTextArray);
                    }
                }
            }
        }
    }
}
From source file:org.apache.usergrid.utils.IndexUtils.java
License:Apache License
public static List<String> keywords(String source) {
    TokenStream ts = analyzer.tokenStream("keywords", new StringReader(source));
    List<String> keywords = new ArrayList<String>();
    try {
        while (ts.incrementToken()) {
            keywords.add(ts.getAttribute(TermAttribute.class).term());
        }
    } catch (IOException e) {
        LOG.error("Error getting keywords ", e);
    }
    return keywords;
}
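The snippet above targets the pre-Lucene-4.0 attribute API: TermAttribute was removed in 4.0, and newer versions also require reset() before the first call to incrementToken(). A hypothetical CharTermAttribute-based variant of the same method, assuming Lucene 5.x or later and substituting a StandardAnalyzer and printStackTrace for the class's own analyzer and LOG fields, might look like this:

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class KeywordsSketch {
    private static final Analyzer analyzer = new StandardAnalyzer(); // stand-in for the class's analyzer field

    public static List<String> keywords(String source) {
        List<String> keywords = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream("keywords", new StringReader(source))) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                        // required in Lucene 4.x+, which the original omits
            while (ts.incrementToken()) {
                keywords.add(term.toString()); // CharTermAttribute.toString() replaces TermAttribute.term()
            }
            ts.end();
        } catch (IOException e) {
            e.printStackTrace();               // the original logs via LOG.error
        }
        return keywords;
    }

    public static void main(String[] args) {
        System.out.println(keywords("A quick example of keyword extraction"));
    }
}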
From source file:org.apdplat.evaluation.impl.SmartCNEvaluation.java
License:Open Source License
private static String segText(String text) {
    StringBuilder result = new StringBuilder();
    try {
        TokenStream tokenStream = SMART_CHINESE_ANALYZER.tokenStream("text", new StringReader(text));
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            result.append(charTermAttribute.toString()).append(" ");
        }
        tokenStream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result.toString();
}
From source file:org.apdplat.word.elasticsearch.ChineseWordIndicesAnalysisTest.java
License:Open Source License
@Test
public void testChineseWordIndicesAnalysis() throws IOException {
    Index index = new Index("test");
    Injector parentInjector = new ModulesBuilder()
            .add(new SettingsModule(SETTINGS), new EnvironmentModule(new Environment(SETTINGS)),
                    new IndicesAnalysisModule())
            .createInjector();
    Injector injector = new ModulesBuilder()
            .add(new IndexSettingsModule(index, SETTINGS), new IndexNameModule(index),
                    new AnalysisModule(SETTINGS, parentInjector.getInstance(IndicesAnalysisService.class))
                            .addProcessor(new ChineseWordAnalysisBinderProcessor()))
            .createChildInjector(parentInjector);
    AnalysisService analysisService = injector.getInstance(AnalysisService.class);

    TokenizerFactory tokenizerFactory = analysisService.tokenizer("word");
    boolean match = (tokenizerFactory instanceof ChineseWordTokenizerFactory);
    assertTrue(match);

    Tokenizer tokenizer = tokenizerFactory.create(new StringReader("?"));
    String exp = "[, ?]";
    List<String> result = new ArrayList<>();
    while (tokenizer.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenizer.getAttribute(CharTermAttribute.class);
        result.add(charTermAttribute.toString());
    }
    assertEquals(exp, result.toString());

    Analyzer analyzer = analysisService.analyzer("word").analyzer();
    match = (analyzer instanceof ChineseWordAnalyzer);
    assertTrue(match);

    TokenStream tokenStream = analyzer.tokenStream("text", "??APDPlat???");
    exp = "[??, apdplat, , ??, ?, ]";
    result = new ArrayList<>();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        result.add(charTermAttribute.toString());
    }
    assertEquals(exp, result.toString());
}
From source file:org.apdplat.word.lucene.ChineseWordAnalyzer.java
License:Open Source License
public static void main(String args[]) throws IOException {
    Analyzer analyzer = new ChineseWordAnalyzer();

    TokenStream tokenStream = analyzer.tokenStream("text", "??APDPlat???");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);
        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.close();

    tokenStream = analyzer.tokenStream("text", "word????????ysc");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);
        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.close();

    tokenStream = analyzer.tokenStream("text", "5?");
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream
                .getAttribute(PositionIncrementAttribute.class);
        LOGGER.info(charTermAttribute.toString() + " (" + offsetAttribute.startOffset() + " - "
                + offsetAttribute.endOffset() + ") " + positionIncrementAttribute.getPositionIncrement());
    }
    tokenStream.close();
}
From source file:org.apdplat.word.lucene.ChineseWordAnalyzerTest.java
License:Open Source License
@Test
public void test1() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "??APDPlat???");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[??, , apdplat, , ?, ?, ?, , ]";
        if ("bigram".equals(WordConfTools.get("ngram", "bigram"))) {
            expResult = "[??, , apdplat, , , ?, ?, ?, , ]";
        }
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("?" + e.getMessage());
    }
}
From source file:org.apdplat.word.lucene.ChineseWordAnalyzerTest.java
License:Open Source License
@Test
public void test2() {
    try {
        Analyzer analyzer = new ChineseWordAnalyzer();
        TokenStream tokenStream = analyzer.tokenStream("text", "??");
        List<String> words = new ArrayList<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
            words.add(charTermAttribute.toString());
        }
        tokenStream.close();
        String expResult = "[??, , , , , , ]";
        assertEquals(expResult, words.toString());
    } catch (IOException e) {
        fail("?" + e.getMessage());
    }
}
From source file:org.archive.porky.TokenizeTextUDF.java
License:Apache License
public String exec(Tuple input) throws IOException {
    String emptyString = "";
    if (input == null || input.size() == 0) {
        return emptyString;
    }
    try {
        String textString = (String) input.get(0);
        if (textString == null) {
            return emptyString;
        }
        if (stopSet == null) {
            // initialize
            List<String> stopWords = new ArrayList<String>();
            // read in stop words file
            // Open the file as a local file.
            FileReader fr = new FileReader(stopWordsFile);
            BufferedReader d = new BufferedReader(fr);
            String line;
            while ((line = d.readLine()) != null) {
                stopWords.add(line);
            }
            fr.close();
            stopSet = new CharArraySet(Version.LUCENE_45, stopWords, true);
        }
        TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_45, new StringReader(textString));
        tokenStream = new StopFilter(Version.LUCENE_45, tokenStream, stopSet);
        StringBuilder sb = new StringBuilder();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            sb.append(term + " ");
        }
        return sb.toString();
    } catch (Exception e) {
        return emptyString;
    }
}
From source file:org.betaconceptframework.astroboa.model.impl.query.xpath.XPathUtils.java
License:Open Source License
private static String analyzeTextToFind(String textToFind) throws IOException {
    // Filter textToFind through GreekAnalyzer
    TokenStream stream = greekAnalyzer.tokenStream("", new StringReader(textToFind));
    stream.reset();

    StringBuilder analyzedTextTofind = new StringBuilder();
    try {
        while (stream.incrementToken()) {
            String term = stream.getAttribute(TermAttribute.class).term();
            analyzedTextTofind.append(term);
            analyzedTextTofind.append(" ");
        }
    } catch (IOException e) {
        e.printStackTrace();
        analyzedTextTofind.append(textToFind);
    } finally {
        stream.end();
        stream.close();
    }

    String result = analyzedTextTofind.toString().trim();
    if (StringUtils.isBlank(result))
        return textToFind;
    return result;
}
From source file:org.bibsonomy.lucene.search.LuceneResourceSearch.java
License:Open Source License
/**
 * analyzes given input parameter
 *
 * @param fieldName the name of the field
 * @param param the value of the field
 * @return the analyzed string
 * @throws IOException
 */
protected String parseToken(final String fieldName, final String param) throws IOException {
    if (present(param)) {
        // use lucene's new token stream api (see org.apache.lucene.analysis' javadoc at package level)
        final TokenStream ts = this.getAnalyzer().tokenStream(fieldName, new StringReader(param));
        final TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
        ts.reset();
        // analyze the parameter - that is: concatenate its normalized tokens
        final StringBuilder analyzedString = new StringBuilder();
        while (ts.incrementToken()) {
            analyzedString.append(" ").append(termAtt.term());
        }
        return analyzedString.toString().trim();
    }
    return "";
}