List of usage examples for org.apache.lucene.analysis TokenStream close
@Override public void close() throws IOException
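All of the examples below follow the same basic contract for consuming a TokenStream before closing it. As a point of reference, here is a minimal sketch of that contract, assuming the attribute-based API of Lucene 4.x and later; the field name "body", the class name, and the analyzer argument are placeholders, not taken from any example on this page:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public final class TokenStreamCloseSketch {

    // Prints every token the analyzer produces for the given text.
    // reset / incrementToken / end / close is the sequence every
    // consumer of a TokenStream is expected to follow.
    static void consume(Analyzer analyzer, String text) throws IOException {
        TokenStream stream = analyzer.tokenStream("body", new StringReader(text)); // "body" is a placeholder field name
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        try {
            stream.reset();                       // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            stream.end();                         // records the final offset state
        } finally {
            stream.close();                       // releases the underlying Reader and other resources
        }
    }
}

Skipping reset() or end() is tolerated by some older analyzers, but on recent Lucene versions it typically fails with a "TokenStream contract violation" IllegalStateException, which is why several of the snippets below guard close() with a try/finally block.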
From source file:com.bizosys.hsearch.inpipe.ComputeTokens.java
License:Apache License
private void tokenize(Doc doc, TermStream ts) throws SystemFault, ApplicationFault, IOException {
    if (null == ts)
        return;
    TokenStream stream = ts.stream;
    if (null == stream)
        return;

    DocTerms terms = doc.terms;
    if (null == doc.terms) {
        terms = new DocTerms();
        doc.terms = terms;
    }

    String token = null;
    int offset = 0;
    CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
        token = termA.toString();
        offset = offsetA.startOffset();
        Term term = new Term(doc.tenant, token, ts.sighting, ts.type, offset);
        terms.getTermList().add(term);
    }
    stream.close();
}
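Because TokenStream implements Closeable, the explicit stream.close() above can also be handled by try-with-resources. A minimal sketch reusing the names from this example (Doc, TermStream, DocTerms and Term are this project's own classes, assumed unchanged), with the end() call that the TokenStream contract expects added before the stream is closed:

// Sketch only: same attributes as the example above, but close() is handled
// by try-with-resources and end() is called once iteration finishes.
try (TokenStream stream = ts.stream) {            // assumes ts.stream is non-null at this point
    CharTermAttribute termA = stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = stream.getAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        terms.getTermList().add(
                new Term(doc.tenant, termA.toString(), ts.sighting, ts.type, offsetA.startOffset()));
    }
    stream.end();
}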
From source file:com.bizosys.hsearch.kv.indexing.KVMapperBase.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked" }) public final void mapFreeTextBitset(final Field fld, final Context context) throws IOException, InterruptedException { terms.clear();//from w ww.j a v a 2 s. c om CharTermAttribute termAttribute = null; TokenStream stream = null; try { if (isFieldNull) return; Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fld.name); stream = analyzer.tokenStream(fld.name, new StringReader(fldValue)); termAttribute = stream.getAttribute(CharTermAttribute.class); String last2 = null; String last1 = null; while (stream.incrementToken()) { String termWord = termAttribute.toString(); if (0 == termWord.length()) continue; appender.delete(0, appender.capacity()); /** * Row Key is mergeidFIELDwordhashStr */ boolean isEmpty = (null == mergeId) ? true : (mergeId.length() == 0); String rowKeyPrefix = (isEmpty) ? fld.name : mergeId + "_" + fld.name; rowKeyP1 = rowKeyPrefix + termWord; rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); appender.setLength(0); rowVal = appender.append(incrementalIdSeekPosition).toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); if (!fld.isBiWord && !fld.isTriWord) continue; /** * Do Three phrase word */ if (null != last2) { appender.setLength(0); rowKeyP1 = appender.append(rowKeyPrefix).append(last2).append(' ').append(last1).append(' ') .append(termWord).append(' ').append('*').toString(); appender.setLength(0); rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); } /** * Do Two phrase word */ if (null != last1) { appender.setLength(0); rowKeyP1 = appender.append(rowKeyPrefix).append(last1).append(' ').append(termWord).append(' ') .append('*').toString(); appender.setLength(0); rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); } last2 = last1; last1 = termWord; } } catch (Exception e) { e.printStackTrace(); System.err.println("Error While tokenizing : " + e.getMessage()); } finally { try { if (null != stream) stream.close(); } catch (Exception ex) { IdSearchLog.l.warn("Error during Tokenizer Stream closure"); } } }
From source file:com.bizosys.unstructured.IndexWriter.java
License:Apache License
/**
 * Find the last offset.
 * Find each term offset
 *
 * @param stream
 * @param docId
 * @param docType
 * @param fieldType
 * @param fieldBoost
 * @param codecs
 * @param uniqueTokens
 * @throws IOException
 */
private final void tokenize(TokenStream stream, int docId, int docType, DocumentMetadata filter, int fieldType,
        Map<String, IndexRow> uniqueTokens) throws IOException {

    String token = null;
    int curoffset = 0;
    int lastoffset = 0;
    int position = -1;

    StringBuilder sb = new StringBuilder();
    CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);

    while (stream.incrementToken()) {
        token = termA.toString();
        curoffset = offsetA.endOffset();

        if (lastoffset != curoffset)
            position++;
        lastoffset = curoffset;

        String key = IndexRow.generateKey(sb, docId, token, docType, fieldType, filter);
        sb.delete(0, sb.capacity());

        if (uniqueTokens.containsKey(key)) {
            IndexRow existingRow = uniqueTokens.get(key);
            existingRow.set(curoffset, position);
            existingRow.occurance++;
        } else {
            IndexRow row = new IndexRow(docId, token, docType, fieldType, curoffset, position);
            if (null != filter)
                row.docMeta = filter;
            uniqueTokens.put(key, row);
        }
    }
    stream.end();
    stream.close();

    for (IndexRow row : uniqueTokens.values())
        cachedIndex.add(row);
}
From source file:com.chriscx.stem.Stem.java
public String evaluate(BufferedReader input) {
    if (input == null) {
        return null;
    }
    CharArraySet stopWordsSet = new CharArraySet(Version.LUCENE_46, 10000, true);
    String stopWords = "a afin ai ainsi après attendu au aujourd auquel aussi "
            + "autre autres aux auxquelles auxquels avait avant avec c car ce "
            + "ceci cela celle celles celui cependant certain certaine certaines "
            + "certains ces cet cette ceux chez ci combien comme comment "
            + "concernant contre d dans de debout dedans dehors delà depuis "
            + "derrière des désormais desquelles desquels devers devra doit "
            + "donc dont du duquel durant dès elle elles en entre environ est"
            + " et etc eu eux excepté hormis hors hélas hui il ils j je jusqu "
            + "jusque l la laquelle le lequel les lesquelles lesquels leur leurs "
            + "lorsque lui là ma mais malgré me merci mes mien mienne miennes "
            + "miens moins mon moyennant même mêmes n ne ni non nos notre nous "
            + "néanmoins nôtre nôtres on ont ou outre où par parmi partant pas "
            + "passé pendant plein plus plusieurs pour pourquoi proche près "
            + "puisque qu quand que quel quelle quelles quels qui quoi quoique"
            + " revoici revoilà s sa sans sauf se selon seront ses si sien "
            + "sienne siennes siens sinon soi soit son sont sous suivant sur "
            + "ta te tes tien tienne tiennes tiens ton tous tout toute toutes"
            + " tu un une va vers voici voilà vos votre vous vu vôtre vôtres y "
            + " été être ";
    String[] stopWordsTab = stopWords.split(" ");
    for (String word : stopWordsTab) {
        stopWordsSet.add(word);
    }
    Analyzer analyzer = new FrenchAnalyzer(Version.LUCENE_46, stopWordsSet);
    result = "";
    try {
        String line = input.readLine();
        line = line.replaceAll("(\\S)+@(\\S)+.(\\S)+", "");
        line = line.replaceAll("(0[0-68]([-. ]?\\d{2}){4}[-. ]?)|\\d+", "");
        line = line.replaceAll("(_|-)+", "");
        line = line.replaceAll("(\\n|\\r|\\t)+", "");
        line = line.replaceAll("(?![\\._])\\p{P}", "");

        while (line != null) {
            TokenStream stream = analyzer.tokenStream(null, line);
            stream.reset();
            while (stream.incrementToken()) {
                String wordset = stream.getAttribute(CharTermAttribute.class).toString();
                wordset = wordset.replaceAll("(0[0-68]([-. ]?\\d{2}){4}[-. ]?)|\\d+", "");
                result += wordset + " ";
            }
            result += "\n";
            stream.close();
            line = input.readLine();
        }
        input.close();
        return result;
    } catch (IOException e) {
        // not thrown b/c we're using a string reader...
        throw new RuntimeException(e);
    }
}
From source file:com.clustertest2.clustertest2.vectorization.DocTokenizer.java
public void performWork(Path doc) throws IOException {
    try {
        System.out.println("performing token work");
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder part = new StringBuilder();

        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(doc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            System.out.println(key);
            String value = pair.getSecond().toString();
            part.append(key);
            TokenStream stream = analyzer.tokenStream(key, new StringReader(value));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            StringTuple document = new StringTuple();
            while (stream.incrementToken()) {
                if (termAtt.length() > 0) {
                    document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                }
            }
            stream.end();
            stream.close();
            tokenized.put(new Text(key), document);
        }

        // write the sequencefile
        Path tokenizedSeq = new Path(vectorsDir, part.toString());
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            writer.close();
            System.out.println("wrote");
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}
From source file:com.flaptor.hounder.classifier.util.DocumentParser.java
License:Apache License
/**
 * Transforms the document into an array of tokens and counts the number of
 * occurrences of each token.
 * @param doc the document represented as a string
 * @param maxTuple If maxTuple>1 then tuples of 1..maxTuples will be returned.
 * Ie if the document is "t1 t2 t3 t4" and maxTuple=2, then the returned
 * map will contain values for each of the following: t1, t2, t1_t2, t2_t3
 * If maxTuple <1 then maxTuple=1.
 * @return a map that binds each token with the count of occurrences within
 * the document
 * @see {@link TupleTokenizer}{@link #parse(String, int)}
 * The map should be '<String,int>'. But int can't be inserted into a Map, and
 * Integer is unmodifiable. So this awful hack uses an int[] to be able to
 * add an int and change its value easily during the calculation.
 */
public static Map<String, int[]> parse(String doc, int maxTuple) {
    // TODO: Use Integer instead of int[].
    Map<String, int[]> tokenCount = new HashMap<String, int[]>();
    // TODO: Decouple from lucene, allow the analyzer to be configurable.
    // TODO: Verify that it is necessary to create a new analyzer instance each time.
    Analyzer analyzer = new StandardAnalyzer();
    Reader docReader = new StringReader(doc);
    TokenStream tokenStream = analyzer.tokenStream(null, docReader);
    try {
        if (1 < maxTuple) {
            tokenStream = new TupleTokenizer(tokenStream, maxTuple);
        }
        Token token = new Token();
        while ((token = tokenStream.next(token)) != null) {
            String term = TokenUtil.termText(token);
            int[] count = tokenCount.get(term);
            if (count == null) {
                count = new int[] { 0 };
                tokenCount.put(term, count);
            } else {
                count[0]++;
            }
        }
    } catch (IOException e) {
        System.err.println("parse: couldn't parse document " + e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            System.err.println("close: " + e);
        }
    }
    return tokenCount;
}
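This example still uses the pre-Lucene-2.9 Token/next(Token) iteration, which later Lucene versions removed. On the attribute-based API used by the other examples on this page, a counting loop of this kind would look roughly like the sketch below; this is a hypothetical standalone method, not the project's code, and the TupleTokenizer wrapping is omitted because its newer-API form depends on that class:

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class AttributeBasedCounting {

    // Counts token occurrences with the attribute-based API that replaced next(Token).
    // Single-token counting only; tuple expansion from the original is left out.
    static Map<String, Integer> countTerms(Analyzer analyzer, String doc) throws IOException {
        Map<String, Integer> tokenCount = new HashMap<>();
        TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(doc));
        CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                tokenCount.merge(termAtt.toString(), 1, Integer::sum);
            }
            tokenStream.end();
        } finally {
            tokenStream.close();
        }
        return tokenCount;
    }
}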
From source file:com.flaptor.indextank.query.IndexEngineParser.java
License:Apache License
public Iterator<AToken> parseDocumentField(String fieldName, String content) {
    final TokenStream tkstream = analyzer.tokenStream(fieldName, new StringReader(content));
    final TermAttribute termAtt = tkstream.addAttribute(TermAttribute.class);
    final PositionIncrementAttribute posIncrAttribute = tkstream.addAttribute(PositionIncrementAttribute.class);
    final OffsetAttribute offsetAtt = tkstream.addAttribute(OffsetAttribute.class);

    return new AbstractIterator<AToken>() {
        int currentPosition = 0;

        @Override
        protected AToken computeNext() {
            try {
                if (!tkstream.incrementToken()) {
                    tkstream.end();
                    tkstream.close();
                    return endOfData();
                }
            } catch (IOException e) {
                // This should never happen, as the reader is a StringReader
            }
            //final org.apache.lucene.analysis.Token luceneTk = tkstream.getAttribute(org.apache.lucene.analysis.Token.class);
            currentPosition += posIncrAttribute.getPositionIncrement();
            final int position = currentPosition;
            final int startOffset = offsetAtt.startOffset();
            final int endOffset = offsetAtt.endOffset();
            final String text = termAtt.term();
            return new AToken() {
                @Override
                public String getText() {
                    return text; //luceneTk.term();
                }

                @Override
                public int getPosition() {
                    return position; //luceneTk.getPositionIncrement();
                }

                @Override
                public int getStartOffset() {
                    return startOffset;
                }

                @Override
                public int getEndOffset() {
                    return endOffset;
                }
            };
        }
    };
}
From source file:com.fujitsu.ca.fic.caissepop.evaluation.TokenizeText.java
License:Apache License
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0)) {
        return null;
    }

    DataBag bagOfTokens = bagFactory.newDefaultBag();
    TokenStream tokenStream = null;
    try {
        String lineOfText = input.get(0).toString();
        StringReader textInput = new StringReader(lineOfText);
        tokenStream = analyzer.tokenStream(noField, textInput);
        CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);

        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            Tuple termText = tupleFactory.newTuple(termAttribute.toString());
            bagOfTokens.add(termText);
            termAttribute.setEmpty();
        }
    } finally {
        if (tokenStream != null) {
            tokenStream.close();
        }
    }
    return bagOfTokens;
}
From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java
License:Apache License
private String analyzeQuery(String query, Analyzer analyzer) {

    if (analyzer != null && query != null && query.length() > 0) {
        TokenStream tokenStream = analyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME, new StringReader(query));

        StringBuilder newQueryB = new StringBuilder();
        try {
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
                // OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
                // TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

                newQueryB.append(term.toString());
                newQueryB.append(' ');
            }
            tokenStream.end();
            return newQueryB.toString().trim();
        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        } finally {
            try {
                tokenStream.close();
            } catch (IOException e) {
                throw new RuntimeException("uncaught exception in synonym processing", e);
            }
        }
    }
    return query;
}
From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java
License:Apache License
/**
 * Given the synonymAnalyzer, returns a list of all alternate queries expanded from the original user query.
 *
 * @param synonymAnalyzer
 * @param solrParams
 * @return
 */
private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) {

    String origQuery = getQueryStringFromParser();
    int queryLen = origQuery.length();

    // TODO: make the token stream reusable?
    TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME,
            new StringReader(origQuery));

    SortedSetMultimap<Integer, TextInQuery> startPosToTextsInQuery = TreeMultimap.create();

    boolean constructPhraseQueries = solrParams.getBool(Params.SYNONYMS_CONSTRUCT_PHRASES, false);
    boolean bag = solrParams.getBool(Params.SYNONYMS_BAG, false);
    List<String> synonymBag = new ArrayList<>();

    try {
        tokenStream.reset();

        while (tokenStream.incrementToken()) {
            CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

            if (!typeAttribute.type().equals("shingle")) {
                // ignore shingles; we only care about synonyms and the original text
                // TODO: filter other types as well

                String termToAdd = term.toString();

                if (typeAttribute.type().equals("SYNONYM")) {
                    synonymBag.add(termToAdd);
                }

                // Don't quote single-term synonyms
                if (constructPhraseQueries && typeAttribute.type().equals("SYNONYM")
                        && termToAdd.contains(" ")) {

                    // Don't quote when the original is already surrounded by quotes
                    if (offsetAttribute.startOffset() == 0 || offsetAttribute.endOffset() == queryLen
                            || origQuery.charAt(offsetAttribute.startOffset() - 1) != '"'
                            || origQuery.charAt(offsetAttribute.endOffset()) != '"') {
                        // make a phrase out of the synonym
                        termToAdd = new StringBuilder(termToAdd).insert(0, '"').append('"').toString();
                    }
                }
                if (!bag) {
                    // create a graph of all possible synonym combinations,
                    // e.g. dog bite, hound bite, dog nibble, hound nibble, etc.
                    TextInQuery textInQuery = new TextInQuery(termToAdd, offsetAttribute.startOffset(),
                            offsetAttribute.endOffset());
                    startPosToTextsInQuery.put(offsetAttribute.startOffset(), textInQuery);
                }
            }
        }
        tokenStream.end();
    } catch (IOException e) {
        throw new RuntimeException("uncaught exception in synonym processing", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        }
    }

    List<String> alternateQueries = synonymBag;

    if (!bag) {
        // use a graph rather than a bag
        List<List<TextInQuery>> sortedTextsInQuery = new ArrayList<>(startPosToTextsInQuery.values().size());
        sortedTextsInQuery.addAll(startPosToTextsInQuery.asMap().values().stream().map(ArrayList::new)
                .collect(Collectors.toList()));

        // have to use the start positions and end positions to figure out all possible combinations
        alternateQueries = buildUpAlternateQueries(solrParams, sortedTextsInQuery);
    }

    // save for debugging purposes
    expandedSynonyms = alternateQueries;

    return createSynonymQueries(solrParams, alternateQueries);
}