List of usage examples for org.apache.lucene.analysis TokenStream end
public void end() throws IOException
This method is called by the consumer after the last token has been consumed, after incrementToken() returned false (using the new TokenStream API). It can be used to perform any end-of-stream operations, such as setting the final offset of a stream.
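For orientation before the real-world listings below, here is a minimal consumer sketch of the reset()/incrementToken()/end()/close() contract. It is not taken from any of the source files on this page, and it assumes a recent Lucene release where StandardAnalyzer has a no-argument constructor and Analyzer.tokenStream accepts a plain String.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public static void printTokens(String text) throws IOException {
    Analyzer analyzer = new StandardAnalyzer();
    // try-with-resources closes the stream after end() has been called
    try (TokenStream stream = analyzer.tokenStream("field", text)) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();                    // required before the first incrementToken()
        while (stream.incrementToken()) {  // iterate over all tokens
            System.out.println(termAtt.toString());
        }
        stream.end();                      // perform end-of-stream operations, e.g. record the final offset
    }
}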
From source file:cn.edu.thss.iise.beehivez.server.index.luceneindex.analyzer.SemicolonAnalyzer.java
License:Open Source License
/**
 * @param args
 */
public static void main(String[] args) throws IOException {
    // text to tokenize
    final String text = "This is a demo of , the new TokenStream API";
    SemicolonAnalyzer analyzer = new SemicolonAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

    // get the TermAttribute from the TokenStream
    TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
    stream.reset();

    // print all tokens until stream is exhausted
    while (stream.incrementToken()) {
        System.out.println(termAtt.term());
    }
    stream.end();
    stream.close();
}
From source file:cn.edu.thss.iise.beehivez.server.util.StringSimilarityUtil.java
License:Open Source License
/**
 * Tokenize the given string: all the words are extracted, lowercased,
 * all the stop words are removed, and all the words are replaced with
 * their stem.
 *
 * @param label
 * @return
 */
public static HashSet<String> snowballTokenize(String label) {
    HashSet<String> ret = new HashSet<String>();
    try {
        Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English",
                StandardAnalyzer.STOP_WORDS_SET);

        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(label));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            ret.add(termAtt.term());
        }
        stream.end();
        stream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}
From source file:com.billiger.solr.handler.component.QLTBComponent.java
License:Apache License
/**
 * Get analyzed version of the query string.
 *
 * This uses the analyzer for the configured FieldType for this
 * component to analyze and re-assemble the original query string.
 * If no queryFieldType is configured, the original query will be
 * returned.
 *
 * This is used both in the prepare() stage of the component and
 * when reading the QLTB map data.
 */
String getAnalyzedQuery(String query) throws IOException {
    if (analyzer == null) {
        return query;
    }
    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", new StringReader(query));
    tokens.reset();
    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    while (tokens.incrementToken()) {
        norm.append(termAtt.buffer(), 0, termAtt.length());
    }
    tokens.end();
    tokens.close();
    return norm.toString();
}
From source file:com.bizosys.unstructured.IndexWriter.java
License:Apache License
/**
 * Find the last offset.
 * Find each term offset.
 *
 * @param stream
 * @param docId
 * @param docType
 * @param fieldType
 * @param fieldBoost
 * @param codecs
 * @param uniqueTokens
 * @throws IOException
 */
private final void tokenize(TokenStream stream, int docId, int docType, DocumentMetadata filter,
        int fieldType, Map<String, IndexRow> uniqueTokens) throws IOException {

    String token = null;
    int curoffset = 0;
    int lastoffset = 0;
    int position = -1;

    StringBuilder sb = new StringBuilder();
    CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);

    while (stream.incrementToken()) {
        token = termA.toString();
        curoffset = offsetA.endOffset();

        if (lastoffset != curoffset)
            position++;
        lastoffset = curoffset;

        String key = IndexRow.generateKey(sb, docId, token, docType, fieldType, filter);
        sb.delete(0, sb.capacity());

        if (uniqueTokens.containsKey(key)) {
            IndexRow existingRow = uniqueTokens.get(key);
            existingRow.set(curoffset, position);
            existingRow.occurance++;
        } else {
            IndexRow row = new IndexRow(docId, token, docType, fieldType, curoffset, position);
            if (null != filter)
                row.docMeta = filter;
            uniqueTokens.put(key, row);
        }
    }
    stream.end();
    stream.close();

    for (IndexRow row : uniqueTokens.values())
        cachedIndex.add(row);
}
From source file:com.clustertest2.clustertest2.vectorization.DocTokenizer.java
public void performWork(Path doc) throws IOException {
    try {
        System.out.println("performing token work");
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder part = new StringBuilder();

        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(doc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            System.out.println(key);
            String value = pair.getSecond().toString();
            part.append(key);
            TokenStream stream = analyzer.tokenStream(key, new StringReader(value));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            StringTuple document = new StringTuple();
            while (stream.incrementToken()) {
                if (termAtt.length() > 0) {
                    document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                }
            }
            stream.end();
            stream.close();
            tokenized.put(new Text(key), document);
        }

        // write the sequencefile
        Path tokenizedSeq = new Path(vectorsDir, part.toString());
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            writer.close();
            System.out.println("wrote");
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}
From source file:com.flaptor.indextank.query.IndexEngineParser.java
License:Apache License
public Iterator<AToken> parseDocumentField(String fieldName, String content) {
    final TokenStream tkstream = analyzer.tokenStream(fieldName, new StringReader(content));
    final TermAttribute termAtt = tkstream.addAttribute(TermAttribute.class);
    final PositionIncrementAttribute posIncrAttribute = tkstream.addAttribute(PositionIncrementAttribute.class);
    final OffsetAttribute offsetAtt = tkstream.addAttribute(OffsetAttribute.class);

    return new AbstractIterator<AToken>() {
        int currentPosition = 0;

        @Override
        protected AToken computeNext() {
            try {
                if (!tkstream.incrementToken()) {
                    tkstream.end();
                    tkstream.close();
                    return endOfData();
                }
            } catch (IOException e) {
                // This should never happen, as the reader is a StringReader
            }
            //final org.apache.lucene.analysis.Token luceneTk = tkstream.getAttribute(org.apache.lucene.analysis.Token.class);
            currentPosition += posIncrAttribute.getPositionIncrement();
            final int position = currentPosition;
            final int startOffset = offsetAtt.startOffset();
            final int endOffset = offsetAtt.endOffset();
            final String text = termAtt.term();
            return new AToken() {
                @Override
                public String getText() {
                    return text; //luceneTk.term();
                }

                @Override
                public int getPosition() {
                    return position; //luceneTk.getPositionIncrement();
                }

                @Override
                public int getStartOffset() {
                    return startOffset;
                }

                @Override
                public int getEndOffset() {
                    return endOffset;
                }
            };
        }
    };
}
From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java
License:Apache License
private String analyzeQuery(String query, Analyzer analyzer) {
    if (analyzer != null && query != null && query.length() > 0) {
        TokenStream tokenStream = analyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME, new StringReader(query));
        StringBuilder newQueryB = new StringBuilder();
        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
                // OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
                // TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

                newQueryB.append(term.toString());
                newQueryB.append(' ');
            }
            tokenStream.end();
            return newQueryB.toString().trim();
        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        } finally {
            try {
                tokenStream.close();
            } catch (IOException e) {
                throw new RuntimeException("uncaught exception in synonym processing", e);
            }
        }
    }
    return query;
}
From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java
License:Apache License
/**
 * Given the synonymAnalyzer, returns a list of all alternate queries expanded from the original user query.
 *
 * @param synonymAnalyzer
 * @param solrParams
 * @return
 */
private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) {

    String origQuery = getQueryStringFromParser();
    int queryLen = origQuery.length();

    // TODO: make the token stream reusable?
    TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME,
            new StringReader(origQuery));

    SortedSetMultimap<Integer, TextInQuery> startPosToTextsInQuery = TreeMultimap.create();

    boolean constructPhraseQueries = solrParams.getBool(Params.SYNONYMS_CONSTRUCT_PHRASES, false);
    boolean bag = solrParams.getBool(Params.SYNONYMS_BAG, false);
    List<String> synonymBag = new ArrayList<>();

    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

            if (!typeAttribute.type().equals("shingle")) {
                // ignore shingles; we only care about synonyms and the original text
                // TODO: filter other types as well

                String termToAdd = term.toString();

                if (typeAttribute.type().equals("SYNONYM")) {
                    synonymBag.add(termToAdd);
                }

                // Don't quote single-term synonyms
                if (constructPhraseQueries && typeAttribute.type().equals("SYNONYM")
                        && termToAdd.contains(" ")) {

                    // Don't quote when the original is already surrounded by quotes
                    if (offsetAttribute.startOffset() == 0 || offsetAttribute.endOffset() == queryLen
                            || origQuery.charAt(offsetAttribute.startOffset() - 1) != '"'
                            || origQuery.charAt(offsetAttribute.endOffset()) != '"') {
                        // make a phrase out of the synonym
                        termToAdd = new StringBuilder(termToAdd).insert(0, '"').append('"').toString();
                    }
                }
                if (!bag) {
                    // create a graph of all possible synonym combinations,
                    // e.g. dog bite, hound bite, dog nibble, hound nibble, etc.
                    TextInQuery textInQuery = new TextInQuery(termToAdd, offsetAttribute.startOffset(),
                            offsetAttribute.endOffset());
                    startPosToTextsInQuery.put(offsetAttribute.startOffset(), textInQuery);
                }
            }
        }
        tokenStream.end();
    } catch (IOException e) {
        throw new RuntimeException("uncaught exception in synonym processing", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        }
    }

    List<String> alternateQueries = synonymBag;

    if (!bag) {
        // use a graph rather than a bag
        List<List<TextInQuery>> sortedTextsInQuery = new ArrayList<>(startPosToTextsInQuery.values().size());
        sortedTextsInQuery.addAll(startPosToTextsInQuery.asMap().values().stream().map(ArrayList::new)
                .collect(Collectors.toList()));

        // have to use the start positions and end positions to figure out all possible combinations
        alternateQueries = buildUpAlternateQueries(solrParams, sortedTextsInQuery);
    }

    // save for debugging purposes
    expandedSynonyms = alternateQueries;

    return createSynonymQueries(solrParams, alternateQueries);
}
From source file:com.github.pmerienne.trident.ml.preprocessing.EnglishTokenizer.java
License:Apache License
public List<String> tokenize(String text) {
    List<String> words = new ArrayList<String>();
    if (text != null && !text.isEmpty()) {
        TokenStream tokenStream = this.createTokenStream(text);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        try {
            while (tokenStream.incrementToken()) {
                String term = charTermAttribute.toString();
                words.add(term);
            }
        } catch (IOException ioe) {
            LOGGER.error("Unable to analyze text. Cause : " + ioe.getMessage(), ioe);
        } finally {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (IOException e) {
                // Can't do nothing!!
                LOGGER.error("Unable to close token stream : " + e.getMessage());
            }
        }
    }
    return words;
}
From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java
License:Apache License
private List<LuceneToken> readTokens(TokenStream tokenStream) throws IOException {
    ArrayList<LuceneToken> tokens = new ArrayList<LuceneToken>();
    HashMap<Integer, LuceneToken> tokensByStartOffset = new HashMap<Integer, LuceneToken>();
    addAttributes(tokenStream);
    tokenStream.reset();

    while (tokenStream.incrementToken()) {
        if (tokenStream.hasAttributes()) {
            LuceneToken token = new LuceneToken();

            readOffset(tokenStream, token);

            // Lucene may output multiple tokens for compound words
            LuceneToken tokenWithSameStartOffset = tokensByStartOffset.get(token.getStartOffset());
            if (tokenWithSameStartOffset != null) {
                if (token.getEndOffset() >= tokenWithSameStartOffset.getEndOffset()) {
                    continue;
                } else {
                    tokens.remove(tokenWithSameStartOffset);
                }
            }

            readReading(tokenStream, token);
            readPartOfSpeech(tokenStream, token);
            readInflection(tokenStream, token);
            readBaseForm(tokenStream, token);

            tokensByStartOffset.put(token.getStartOffset(), token);
            tokens.add(token);
        }
    }
    tokenStream.end();
    tokenStream.close();
    return tokens;
}