List of usage examples for org.apache.lucene.analysis TokenStream close
@Override public void close() throws IOException
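Every example on this page follows the same life cycle: obtain a TokenStream from an Analyzer, consume it, then release it with close(). For orientation, here is a minimal, self-contained sketch of that contract. This is an illustrative sketch, not code from any of the projects below; it assumes Lucene 5.x or later (where StandardAnalyzer has a no-argument constructor), and the field name and input text are made up.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamCloseSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream ts = analyzer.tokenStream("body", new StringReader("some text to tokenize"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        try {
            ts.reset();                      // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();                        // record the end-of-stream state
        } finally {
            ts.close();                      // always release the stream, even on failure
        }
    }
}

Since TokenStream implements Closeable, a try-with-resources block is an equally valid way to guarantee the close() call.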
From source file:lucli.LuceneMethods.java
License:Apache License
private void invertDocument(Document doc) throws IOException {
    Map tokenMap = new HashMap();
    final int maxFieldLength = 10000;

    Analyzer analyzer = createAnalyzer();
    Iterator fields = doc.getFields().iterator();
    final Token reusableToken = new Token();
    while (fields.hasNext()) {
        Field field = (Field) fields.next();
        String fieldName = field.name();

        if (field.isIndexed()) {
            if (field.isTokenized()) { // tokenized field
                Reader reader; // find or make Reader
                if (field.readerValue() != null) {
                    reader = field.readerValue();
                } else if (field.stringValue() != null) {
                    reader = new StringReader(field.stringValue());
                } else {
                    throw new IllegalArgumentException("field must have either String or Reader value");
                }

                int position = 0;
                // Tokenize field and add to postingTable
                TokenStream stream = analyzer.tokenStream(fieldName, reader);
                TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
                PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream
                        .addAttribute(PositionIncrementAttribute.class);
                try {
                    while (stream.incrementToken()) {
                        position += (posIncrAtt.getPositionIncrement() - 1);
                        position++;
                        String name = termAtt.term();
                        Integer count = (Integer) tokenMap.get(name);
                        if (count == null) { // not in there yet
                            tokenMap.put(name, Integer.valueOf(1)); // first one
                        } else {
                            tokenMap.put(name, Integer.valueOf(count.intValue() + 1));
                        }
                        if (position > maxFieldLength) {
                            break;
                        }
                    }
                } finally {
                    stream.close();
                }
            }
        }
    }
    Entry[] sortedHash = getSortedMapEntries(tokenMap);
    for (int ii = 0; ii < sortedHash.length && ii < 10; ii++) {
        Entry currentEntry = sortedHash[ii];
        message((ii + 1) + ":" + currentEntry.getKey() + " " + currentEntry.getValue());
    }
}
From source file:lux.search.highlight.XmlHighlighter.java
License:Mozilla Public License
/**
 * Inspired by org.apache.lucene.search.highlight.Highlighter.
 * Sends highlighted events to the writer; the text to highlight is read
 * from the textReader field.
 *
 * @throws XMLStreamException
 */
private void highlightTextNode() throws IOException, XMLStreamException {
    TokenStream tokenStream = analyzer.tokenStream(textFieldName, textReader);
    xmlStreamTokens.reset(tokenStream);
    lastEndOffset = 0;
    for (boolean next = xmlStreamTokens.incrementToken(); next
            && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = xmlStreamTokens.incrementToken()) {
        if (scorerTokens != null && xmlStreamTokens.isPlainToken()) {
            scorerTokens.incrementToken();
        }
        if (tokenGroup.isDistinct()) {
            // write out any accumulated tokens
            handleTokenGroup();
            tokenGroup.clear();
        }
        if (scorerTokens == null || xmlStreamTokens.isPlainToken()) {
            tokenGroup.addToken(scorer.getTokenScore());
        }
    }
    handleTokenGroup();
    tokenGroup.clear();
    writeTrailingText();
    tokenStream.end();
    tokenStream.close();
}
From source file:net.mad.ads.server.utils.http.KeywordUtils.java
License:Open Source License
public static List<String> getTokens(String queryString) {
    try {
        GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_33);
        TokenStream ts = a.tokenStream("", new StringReader(queryString));
        List<String> tokens = new ArrayList<String>();
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String token = termAtt.toString();
            tokens.add(token);
        }
        ts.end();
        ts.close();
        return tokens;
    } catch (IOException e) {
        logger.error("", e);
    }
    return null;
}
From source file:net.sf.logsaw.index.internal.LuceneIndexServiceImpl.java
License:Open Source License
private void fillPhraseQuery(PhraseQuery phrase, Analyzer analyzer, String fld, String val) throws IOException {
    TokenStream ts = analyzer.tokenStream(fld, new StringReader(val));
    try {
        ts.reset();
        // Iterate over tokens and treat each token as a term
        int pos = 0;
        while (ts.incrementToken()) {
            CharTermAttribute t = ts.getAttribute(CharTermAttribute.class);
            PositionIncrementAttribute p = ts.getAttribute(PositionIncrementAttribute.class);
            pos += p.getPositionIncrement();
            phrase.add(new Term(fld, t.toString()), pos - 1);
        }
        // End-of-stream clean-up
        ts.end();
    } finally {
        ts.close();
    }
}
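For context, a hypothetical caller of the method above might look as follows. This is a sketch only: it assumes the mutable PhraseQuery API the method relies on (Lucene 4.x era), and the field name "message", the analyzer, and the searcher are all illustrative, not from the original project.

// Hypothetical usage sketch; "message", analyzer, and searcher are illustrative.
// import org.apache.lucene.search.PhraseQuery;
// import org.apache.lucene.search.TopDocs;
PhraseQuery phrase = new PhraseQuery();
fillPhraseQuery(phrase, analyzer, "message", "connection timed out");
TopDocs hits = searcher.search(phrase, 10); // top 10 matches of the analyzed phrase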
From source file:net.sf.okapi.lib.tmdb.lucene.Seeker.java
License:Open Source License
public List<TmHit> searchFuzzy(String genericText, String codesAsString, String tmId, String locale, int max,
        int threshold, HashMap<String, String> attributes) {
    float searchThreshold = (float) threshold;
    if (threshold < 0)
        searchThreshold = 0.0f;
    if (threshold > 100)
        searchThreshold = 100.0f;

    String queryText = genericText;
    String gtextFName = TmEntry.GTEXT_PREFIX + locale;
    Locale javaLoc = new Locale(locale);

    // create a basic ngram analyzer to tokenize the query
    TokenStream queryTokenStream;
    if (javaLoc.getLanguage().equals(Locale.ENGLISH.getLanguage())) {
        queryTokenStream = defaultFuzzyAnalyzer.tokenStream(gtextFName, new StringReader(queryText));
    } else {
        queryTokenStream = new NgramAnalyzer(javaLoc, 4).tokenStream(gtextFName, new StringReader(queryText));
    }

    // Get the TermAttribute from the TokenStream
    CharTermAttribute termAtt = (CharTermAttribute) queryTokenStream.addAttribute(CharTermAttribute.class);

    TmFuzzyQuery fQuery = new TmFuzzyQuery(searchThreshold, gtextFName);
    try {
        queryTokenStream.reset();
        while (queryTokenStream.incrementToken()) {
            Term t = new Term(gtextFName, termAtt.toString());
            fQuery.add(t);
        }
        queryTokenStream.end();
        queryTokenStream.close();
    } catch (IOException e) {
        throw new OkapiIOException(e.getMessage(), e);
    }
    return getFuzzyHits(fQuery, genericText, codesAsString, tmId, locale, max, searchThreshold, attributes);
}
From source file:net.simpleframework.ado.lucene.AbstractLuceneManager.java
License:Apache License
@Override
public String[] getQueryTokens(final String queryString) {
    TokenStream tokenStream = null;
    try {
        tokenStream = getDefaultAnalyzer().tokenStream("QUERY_TOKENS", new StringReader(queryString));
        tokenStream.reset();
        final ArrayList<String> al = new ArrayList<>();
        while (tokenStream.incrementToken()) {
            final String term = tokenStream.getAttribute(CharTermAttribute.class).toString();
            if (term != null && term.length() > 1) {
                al.add(term);
            }
        }
        if (al.size() == 0) {
            al.add(queryString);
        }
        return al.toArray(new String[al.size()]);
    } catch (final IOException e) {
        throw ADOException.of(e);
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.close();
            } catch (final IOException e) {
                // ignore failures on close
            }
        }
    }
}
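The finally block above hand-rolls the close-quietly idiom. Lucene ships a utility for exactly this; a sketch of the alternative clean-up, assuming org.apache.lucene.util.IOUtils is available (it is part of Lucene core in recent versions):

// Sketch of an alternative clean-up for the method above.
// import org.apache.lucene.util.IOUtils;
TokenStream tokenStream = null;
try {
    tokenStream = getDefaultAnalyzer().tokenStream("QUERY_TOKENS", new StringReader(queryString));
    // ... consume the stream as above ...
} finally {
    // null-safe; any IOException thrown by close() is suppressed
    IOUtils.closeWhileHandlingException(tokenStream);
}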
From source file:net.skyatlas.icd.dao.daoImpl.AnsjAnalysisTest.java
@Test
public void test() throws IOException {
    Token nt = new Token();
    Analyzer ca = new AnsjAnalysis();
    // The original Chinese sample sentence was lost to mis-encoding; the '?' runs
    // below are what survives of it.
    Reader sentence = new StringReader(
            "\n\n\n\n\n\n\n????, ????????????????????????????"
                    + "???????????????????"
                    + "??????????? ??????????????2????" + ""
                    + "? ????????????? ??? ????????");
    TokenStream ts = ca.tokenStream("sentence", sentence);

    System.out.println("start: " + (new Date()));
    long before = System.currentTimeMillis();
    while (ts.incrementToken()) {
        System.out.println(ts.getAttribute(CharTermAttribute.class));
    }
    ts.close();
    long now = System.currentTimeMillis();
    System.out.println("time: " + (now - before) / 1000.0 + " s");
}
From source file:net.skyatlas.icd.test.AnsegTest.java
static public void main(String[] args)
        throws IOException, CorruptIndexException, ParseException, InvalidTokenOffsetsException {
    AnsegTest inst = new AnsegTest();
    Token nt = new Token();
    Analyzer ca = new AnsjAnalysis();
    // The original Chinese sample text was lost to mis-encoding; the '?' runs
    // throughout this file are what survives of it.
    Reader sentence = new StringReader(
            "\n\n\n\n\n\n\n????, ????????????????????????????"
                    + "???????????????????"
                    + "??????????? ??????????????2????" + ""
                    + "? ????????????? ??? ????????");
    TokenStream ts = ca.tokenStream("sentence", sentence);

    System.out.println("start: " + (new Date()));
    long before = System.currentTimeMillis();
    while (ts.incrementToken()) {
        System.out.println(ts.getAttribute(CharTermAttribute.class));
    }
    ts.close();
    long now = System.currentTimeMillis();
    System.out.println("time: " + (now - before) / 1000.0 + " s");

    HashSet<String> hs = new HashSet<String>();
    BufferedReader reader2 = IOUtil.getReader(ResourceBundle.getBundle("library").getString("stopLibrary"),
            "UTF-8");
    String word = null;
    while ((word = reader2.readLine()) != null) {
        hs.add(word);
    }

    Analyzer analyzer = new AnsjAnalysis(hs, false);
    Directory directory = null;
    IndexWriter iwriter = null;

    BufferedReader reader = IOUtil.getReader("/Users/changzhenghe/Downloads/hy_statspack01.txt", "UTF-8");
    String temp = null;
    StringBuilder sb = new StringBuilder();
    while ((temp = reader.readLine()) != null) {
        sb.append(temp);
        sb.append("\n");
    }
    reader.close();
    String text = sb.toString();
    text = "???????????? ??? ????????";

    IndexWriterConfig ic = new IndexWriterConfig(Version.LUCENE_32, analyzer);
    // (a comment on this line was lost to mis-encoding; directory must be
    // assigned here, otherwise the IndexWriter below would fail on null)
    directory = new RAMDirectory();
    iwriter = new IndexWriter(directory, ic);

    // BufferedReader reader =
    //         IOUtil.getReader("/Users/ansj/Documents//?//1998?_.txt", "GBK");
    // String temp = null;
    // while ((temp = reader.readLine()) != null) {
    //     addContent(iwriter, temp);
    // }
    inst.addContent(iwriter, "? ?() (?)");
    inst.addContent(iwriter, " ?() (?)");
    inst.addContent(iwriter, "? ? (?)");
    inst.addContent(iwriter, " ??NEC ");
    inst.addContent(iwriter, "?");
    iwriter.commit();
    iwriter.close();

    System.out.println("");
    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "");
    inst.search(analyzer, directory, "?");

    /*
    KeyWordComputer kwc = new KeyWordComputer(5);
    String title = "??";
    String content = "9??" + "?????????" + "????" + "??" + "?????" + "???"
            + "??????" + "???" + "????20??" + "????" + "?" + "???]??"
            + "???";
    Collection<Keyword> result = kwc.computeArticleTfidf(title, content);
    System.out.println(result);

    AnsegTest t = new AnsegTest();
    List<Term> parse = ToAnalysis.parse("?");
    System.out.println(parse);
    System.out.println("*********** ? ************");
    // UserDefineLibrary.insertWord("", "userDefine", 1000);
    // UserDefineLibrary.insertWord("?", "userDefine", 1000);
    UserDefineLibrary.insertWord("?", "userDefine", 1000);
    parse = ToAnalysis.parse("???");
    System.out.println(parse);
    */
}
From source file:net.strong.weblucene.search.WebLuceneHighlighter.java
License:Apache License
/**
 * Returns the highlighted string.
 *
 * @param srcString source string to highlight
 * @return highlighted string
 */
public String highlight(String srcString) {
    if ((srcString == null) || srcString.trim().equals("")) {
        return "";
    }

    int srcLength = srcString.length();

    // truncate src to maxBufferSize
    if (srcLength >= maxBufferSize) {
        srcString = srcString.substring(0, maxBufferSize);
        srcLength = maxBufferSize;
    }

    // return src if there is no term to highlight
    if (terms.size() == 0) {
        return srcString.substring(0, Math.min(srcLength, maxReturnSize));
    }

    try {
        // reset buffer and last term offset
        int prevEnd = 0; // end of the previous token
        srcBuffer = new char[srcLength];

        StringReader stringReader = new StringReader(srcString);
        stringReader.read(srcBuffer);

        StringReader sr = new StringReader(srcString);
        TokenStream tokenStream = analyzer.tokenStream(null, sr);

        // return string buffer
        StringBuffer returnBuffer = new StringBuffer();
        String preContextBlock = ""; // previous text block

        // highlight: [preContextBlock] + <b> + [token] + </b>
        for (Token t = tokenStream.next(); t != null; t = tokenStream.next()) {
            preContextBlock = getContext(prevEnd, t.startOffset());
            returnBuffer.append(preContextBlock);

            // append the highlighted token
            returnBuffer.append(highlightPrefix);
            for (int i = t.startOffset(); i < t.endOffset(); i++) {
                returnBuffer.append(srcBuffer[i]);
            }
            returnBuffer.append(highlightSuffix);

            // record current offset
            prevEnd = t.endOffset();
            if (returnBuffer.length() > maxReturnSize) {
                break;
            }
        }
        tokenStream.close();

        // no highlight token found: return the first maxReturnSize chars of the source
        if (returnBuffer.length() == 0) {
            if (srcLength > maxReturnSize) {
                returnBuffer.append(srcBuffer, 0, maxReturnSize);
            } else {
                returnBuffer.append(srcBuffer, 0, srcLength);
            }
            return returnBuffer.toString();
        }

        // expand the return string up to maxReturnSize
        while ((returnBuffer.length() < maxReturnSize) && (prevEnd < srcLength)) {
            returnBuffer.append(srcBuffer[prevEnd]);
            prevEnd++;
        }
        return returnBuffer.toString();
    } catch (Exception e) {
        e.printStackTrace();
        // return an empty string on failure
        return "";
    }
}
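The loop above uses the pre-2.9 Token-based API (tokenStream.next()), which no longer exists in current Lucene. For comparison, a sketch of the equivalent loop on the attribute-based API, reusing the surrounding names (getContext, srcBuffer, highlightPrefix/Suffix, maxReturnSize are from the method above); this assumes Lucene 4.x or later:

// Attribute-based equivalent of the deprecated next() loop above (sketch).
// import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(srcString));
OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
try {
    tokenStream.reset();
    int prevEnd = 0;
    while (tokenStream.incrementToken()) {
        returnBuffer.append(getContext(prevEnd, offsetAtt.startOffset()));
        returnBuffer.append(highlightPrefix);
        returnBuffer.append(srcBuffer, offsetAtt.startOffset(),
                offsetAtt.endOffset() - offsetAtt.startOffset());
        returnBuffer.append(highlightSuffix);
        prevEnd = offsetAtt.endOffset();
        if (returnBuffer.length() > maxReturnSize) {
            break;
        }
    }
    tokenStream.end();
} finally {
    tokenStream.close();
}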
From source file:NewsIR_search.TRECQuery.java
/**
 * Returns the analyzed content of the 'queryField' from the query text.
 *
 * @param analyzer
 * @param queryField
 * @return (String) the analyzed content of the field
 * @throws Exception
 */
public String queryFieldAnalyze(Analyzer analyzer, String queryField) throws Exception {
    StringBuffer buff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream(CumulativeIndexer.FIELD_TEXT, new StringReader(queryField));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        buff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return buff.toString();
}