List of usage examples for org.apache.lucene.analysis.TokenStream.close()
@Override public void close() throws IOException
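TokenStream implements Closeable, so every consumer is expected to call close() when it is done, typically in a finally block or via try-with-resources. Below is a minimal sketch of the reset/incrementToken/end/close contract that the examples on this page follow; it assumes a recent Lucene version, a StandardAnalyzer, and a hypothetical field name "body":

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CloseContractSketch {
    public static void printTokens(String text) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources guarantees close() even if incrementToken() throws
        try (TokenStream ts = analyzer.tokenStream("body", text)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();                     // records end-of-stream state, e.g. the final offset
        }                                 // close() happens here
    }
}

The examples below predate or avoid try-with-resources, so they close the stream explicitly, usually in a finally block.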
From source file:ucas.IKAnalzyerDemo.java
License:Apache License
public static String Spilt2Words(String content) {
    String resString = "";
    // IK analyzer, smart segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);
    // the Lucene TokenStream
    TokenStream ts = null;
    try {
        // "myfield" is just a placeholder field name
        ts = analyzer.tokenStream("myfield", new StringReader(content));
        // get the term attribute for the token text
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // reset the TokenStream (repositions the underlying StringReader)
        ts.reset();
        // iterate over the tokens
        while (ts.incrementToken()) {
            resString += term.toString() + "|";
        }
        // end the TokenStream before the StringReader is released
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // close the TokenStream
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return resString;
}
From source file:uk.co.flax.luwak.analysis.TestSuffixingNGramTokenizer.java
License:Apache License
public static void main(String... args) throws IOException {
    String text = Files.toString(new File("src/test/resources/gutenberg/README"), Charsets.UTF_8);
    DocumentBatch batch = DocumentBatch
            .of(InputDocument.builder("1").addField("f", text, new StandardAnalyzer()).build());
    for (int i = 0; i < 50; i++) {
        long time = System.currentTimeMillis();
        // Cannot use try-with-resources here as we assign to ts in the block.
        LeafReader reader = batch.getIndexReader();
        TokenStream ts = new TermsEnumTokenStream(reader.fields().terms("f").iterator());
        try {
            ts = new SuffixingNGramTokenFilter(ts, "XX", "__WILDCARD__", 20);
            //ts = new DuplicateRemovalTokenFilter(ts);
            int tokencount = 0;
            ts.reset();
            while (ts.incrementToken()) {
                tokencount++;
            }
            System.out.println(tokencount + " tokens in " + (System.currentTimeMillis() - time) + " ms");
        } finally {
            ts.close();
        }
    }
}
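The comment above notes that try-with-resources is awkward because ts is reassigned inside the block. A hedged alternative, sketched here with the same Luwak classes, is to build the whole filter chain in the resource declaration; closing the outermost TokenFilter also closes the stream it wraps:

// Sketch only: same classes and arguments as the example above, chain built in one expression.
LeafReader reader = batch.getIndexReader();
try (TokenStream ts = new SuffixingNGramTokenFilter(
        new TermsEnumTokenStream(reader.fields().terms("f").iterator()),
        "XX", "__WILDCARD__", 20)) {
    int tokencount = 0;
    ts.reset();
    while (ts.incrementToken()) {
        tokencount++;
    }
    System.out.println(tokencount + " tokens");
}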
From source file:uk.co.nickthecoder.pinkwino.metadata.LuceneMetaData.java
License:Open Source License
public String analyzeWord(String word) {
    Reader reader = new StringReader(word);
    TokenStream tokenStream = null;
    try {
        tokenStream = _analyzer.tokenStream("content", reader);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            return charTermAttribute.toString();
        }
    } catch (Exception e) {
        _logger.error("Failed to filter a keyword. " + e);
    } finally {
        try {
            if (tokenStream != null) {
                tokenStream.end();
                tokenStream.close();
            }
            reader.close();
        } catch (Exception e) {
            // Do nothing
            _logger.error("Failed to close during analyzeWord " + e);
        }
    }
    return null;
}
From source file:uk.gov.nationalarchives.discovery.taxonomy.common.repository.lucene.analyzer.TaxonomyGeneralAnalyzerTest.java
License:Mozilla Public License
public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[],
        int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset,
        Integer finalPosInc, boolean[] keywordAtts, boolean offsetsAreCorrect) throws IOException {
    assertNotNull(output);
    CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
    CharTermAttribute termAtt = null;
    if (output.length > 0) {
        assertTrue("has no CharTermAttribute", ts.hasAttribute(CharTermAttribute.class));
        termAtt = ts.getAttribute(CharTermAttribute.class);
    }
    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null) {
        assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
        offsetAtt = ts.getAttribute(OffsetAttribute.class);
    }
    TypeAttribute typeAtt = null;
    if (types != null) {
        assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
        typeAtt = ts.getAttribute(TypeAttribute.class);
    }
    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null || finalPosInc != null) {
        assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
    }
    PositionLengthAttribute posLengthAtt = null;
    if (posLengths != null) {
        assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
        posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    }
    KeywordAttribute keywordAtt = null;
    if (keywordAtts != null) {
        assertTrue("has no KeywordAttribute", ts.hasAttribute(KeywordAttribute.class));
        keywordAtt = ts.getAttribute(KeywordAttribute.class);
    }
    // Maps position to the start/end offset:
    final Map<Integer, Integer> posToStartOffset = new HashMap<>();
    final Map<Integer, Integer> posToEndOffset = new HashMap<>();
    ts.reset();
    int pos = -1;
    int lastStartOffset = 0;
    for (int i = 0; i < output.length; i++) {
        // extra safety to enforce that the state is not preserved, and also assign bogus values
        ts.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
        if (offsetAtt != null)
            offsetAtt.setOffset(14584724, 24683243);
        if (typeAtt != null)
            typeAtt.setType("bogusType");
        if (posIncrAtt != null)
            posIncrAtt.setPositionIncrement(45987657);
        if (posLengthAtt != null)
            posLengthAtt.setPositionLength(45987653);
        if (keywordAtt != null)
            keywordAtt.setKeyword((i & 1) == 0);
        checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttributes() before
        assertTrue("token " + i + " does not exist", ts.incrementToken());
        assertTrue("clearAttributes() was not called correctly in TokenStream chain",
                checkClearAtt.getAndResetClearCalled());
        assertEquals("term " + i, output[i], termAtt.toString());
        if (startOffsets != null) {
            assertEquals("startOffset " + i, startOffsets[i], offsetAtt.startOffset());
        }
        if (endOffsets != null) {
            assertEquals("endOffset " + i, endOffsets[i], offsetAtt.endOffset());
        }
        if (types != null) {
            assertEquals("type " + i, types[i], typeAtt.type());
        }
        if (posIncrements != null) {
            assertEquals("posIncrement " + i, posIncrements[i], posIncrAtt.getPositionIncrement());
        }
        if (posLengths != null) {
            assertEquals("posLength " + i, posLengths[i], posLengthAtt.getPositionLength());
        }
        if (keywordAtts != null) {
            assertEquals("keywordAtt " + i, keywordAtts[i], keywordAtt.isKeyword());
        }
        // we can enforce some basic things about a few attributes even if the caller doesn't check:
        if (offsetAtt != null) {
            final int startOffset = offsetAtt.startOffset();
            final int endOffset = offsetAtt.endOffset();
            if (finalOffset != null) {
                assertTrue("startOffset must be <= finalOffset", startOffset <= finalOffset.intValue());
                assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset="
                        + finalOffset.intValue(), endOffset <= finalOffset.intValue());
            }
            if (offsetsAreCorrect) {
                assertTrue("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset="
                        + lastStartOffset, offsetAtt.startOffset() >= lastStartOffset);
                lastStartOffset = offsetAtt.startOffset();
            }
            if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
                // Validate offset consistency in the graph, ie all tokens leaving from a certain pos
                // have the same startOffset, and all tokens arriving to a certain pos have the same endOffset:
                final int posInc = posIncrAtt.getPositionIncrement();
                pos += posInc;
                final int posLength = posLengthAtt.getPositionLength();
                if (!posToStartOffset.containsKey(pos)) {
                    // First time we've seen a token leaving from this position:
                    posToStartOffset.put(pos, startOffset);
                    // System.out.println("  + s " + pos + " -> " + startOffset);
                } else {
                    // We've seen a token leaving from this position before; verify the startOffset is the same:
                    // System.out.println("  + vs " + pos + " -> " + startOffset);
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToStartOffset.get(pos).intValue(), startOffset);
                }
                final int endPos = pos + posLength;
                if (!posToEndOffset.containsKey(endPos)) {
                    // First time we've seen a token arriving to this position:
                    posToEndOffset.put(endPos, endOffset);
                    // System.out.println("  + e " + endPos + " -> " + endOffset);
                } else {
                    // We've seen a token arriving to this position before; verify the endOffset is the same:
                    // System.out.println("  + ve " + endPos + " -> " + endOffset);
                    assertEquals("pos=" + pos + " posLen=" + posLength + " token=" + termAtt,
                            posToEndOffset.get(endPos).intValue(), endOffset);
                }
            }
        }
        if (posIncrAtt != null) {
            if (i == 0) {
                assertTrue("first posIncrement must be >= 1", posIncrAtt.getPositionIncrement() >= 1);
            } else {
                assertTrue("posIncrement must be >= 0", posIncrAtt.getPositionIncrement() >= 0);
            }
        }
        if (posLengthAtt != null) {
            assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
        }
    }
    if (ts.incrementToken()) {
        fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token="
                + termAtt.toString());
    }
    // repeat our extra safety checks for end()
    ts.clearAttributes();
    if (termAtt != null)
        termAtt.setEmpty().append("bogusTerm");
    if (offsetAtt != null)
        offsetAtt.setOffset(14584724, 24683243);
    if (typeAtt != null)
        typeAtt.setType("bogusType");
    if (posIncrAtt != null)
        posIncrAtt.setPositionIncrement(45987657);
    if (posLengthAtt != null)
        posLengthAtt.setPositionLength(45987653);
    checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttributes() before
    ts.end();
    assertTrue("super.end()/clearAttributes() was not called correctly in end()",
            checkClearAtt.getAndResetClearCalled());
    if (finalOffset != null) {
        assertEquals("finalOffset", finalOffset.intValue(), offsetAtt.endOffset());
    }
    if (offsetAtt != null) {
        assertTrue("finalOffset must be >= 0", offsetAtt.endOffset() >= 0);
    }
    if (finalPosInc != null) {
        assertEquals("finalPosInc", finalPosInc.intValue(), posIncrAtt.getPositionIncrement());
    }
    ts.close();
}
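A hedged usage sketch of the helper above: the analyzer, the input string, and the expected values are hypothetical, and only terms, offsets, and the final offset are checked (the other expectations are passed as null so the corresponding attributes are not required):

// Assumes a recent Lucene version where WhitespaceAnalyzer has a no-arg constructor.
TokenStream ts = new WhitespaceAnalyzer().tokenStream("f", "hello world");
assertTokenStreamContents(ts,
        new String[] { "hello", "world" },  // expected terms
        new int[] { 0, 6 },                 // expected start offsets
        new int[] { 5, 11 },                // expected end offsets
        null, null, null,                   // types, posIncrements, posLengths: not checked
        11,                                 // expected final offset (length of the input)
        null, null,                         // finalPosInc, keywordAtts: not checked
        true);                              // offsets are expected to be consistent
// The helper calls ts.end() and ts.close() itself, so no further cleanup is needed here.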
From source file:uoc.dedup.document.fingerprintCharikar.java
License:Open Source License
/**
 * Calculate the fingerprint.
 * Split the text into shingles; for each token generate a Rabin hash, and from
 * the results build the final fingerprint vector.
 * @return fingerprint as a string
 */
public String calculateFingerprint() {
    totalTokens = 0;
    totalnGramTokens = 0;
    TokenStream tk = null;
    if (this.useStemming()) {
        this.analyzer = analyzerCache.newAnalyzer(this.language);
        tk = this.analyzer.tokenStream("fingerprint", reader);
    } else {
        tk = new StandardTokenizer(reader);
    }
    ShingleMatrixFilter tokens = new ShingleMatrixFilter(tk, 1, this.getMAXGRAMS(), new Character(' '));
    // Put the tokens in a map and select the most important terms.
    try {
        while (true) {
            Token token = tokens.next();
            if (token == null) {
                break;
            }
            int numtokens = token.term().split(" ").length;
            if (numtokens == 1) {
                this.add(token.term(), this.m); // Add a token to the list of token frequencies
                //System.out.println(token.term());
                totalTokens++;
            } else if (numtokens >= this.MIMGRAMS) {
                //System.out.println(token.term());
                this.add(token.term(), this.nGrams);
                totalnGramTokens++; // Count the n-gram tokens
            }
        }
        tokens.close();
        this.createTopTerms(this.m, this.getTokensTop(), this.totalTokens);
        // Calculate the fingerprint vector
        this.calculateVectorFingerprint(this.nGrams, this.totalnGramTokens);
        tk.close();
    } catch (IOException e) {
        System.out.println("Error getTokens: " + e.getMessage());
    }
    vFingerprint = this.simHash.getFingerprint();
    this.fingerprint2String();
    return this.getFingerprint();
}
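ShingleMatrixFilter and the Token-returning next() API used above were removed in later Lucene versions. A hedged sketch of the same shingle-consumption loop with the attribute-based API might look like the following; ShingleFilter with unigram output approximates the 1..MAXGRAMS shingles used above, and maxGrams, analyzer, and reader are hypothetical stand-ins for the fields of the class:

TokenStream base = analyzer.tokenStream("fingerprint", reader);
ShingleFilter shingles = new ShingleFilter(base, 2, maxGrams); // maxGrams: stand-in for getMAXGRAMS()
shingles.setOutputUnigrams(true);                              // also emit the single-word tokens
CharTermAttribute term = shingles.addAttribute(CharTermAttribute.class);
try {
    shingles.reset();
    while (shingles.incrementToken()) {
        String gram = term.toString();
        // hash 'gram' and update the frequency maps here, as calculateFingerprint() does
    }
    shingles.end();
} finally {
    shingles.close(); // closing the filter also closes the underlying stream
}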
From source file:varaha.text.TokenizeText.java
License:Apache License
/** Fills a DataBag with tokens from a TokenStream */
public DataBag fillBag(TokenStream stream) throws IOException {
    DataBag result = bagFactory.newDefaultBag();
    CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            if (termAttribute.length() > 0) {
                Tuple termText = tupleFactory.newTuple(termAttribute.toString());
                result.add(termText);
            }
        }
        stream.end();
    } finally {
        stream.close();
    }
    return result;
}
From source file:webdocs.WebDocAnalyzer.java
String preprocessText(String html, boolean title) throws IOException {
    int freqCutoffThreshold = title ? 1 : this.freqCutoffThreshold;
    HashMap<String, Integer> tfMap = new HashMap<>();
    StringBuffer buff = new StringBuffer();
    CharArraySet stopList = StopFilter.makeStopSet(Version.LUCENE_4_9, indexer.buildStopwordList("stopfile"));
    Analyzer webdocAnalyzer = new WebDocAnalyzer(indexer.getProperties(), stopList);
    TokenStream stream = webdocAnalyzer.tokenStream("field", new StringReader(html));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String token = termAtt.toString();
        Integer tf = tfMap.get(token);
        if (tf == null) {
            tf = new Integer(0);
        }
        tf++;
        tfMap.put(token, tf);
    }
    stream.end();
    stream.close();
    for (Map.Entry<String, Integer> e : tfMap.entrySet()) {
        String word = e.getKey();
        int tf = e.getValue();
        if (tf >= freqCutoffThreshold) {
            for (int i = 0; i < tf; i++) {
                // print this word tf times... word order doesn't matter!
                buff.append(word).append(" ");
            }
        }
    }
    return buff.toString();
}
From source file:workTextIndexService.Procesamiento.java
public String normalizar(String texto) {
    resultN = "";
    @SuppressWarnings("deprecation")
    SpanishAnalyzer analyzer = new SpanishAnalyzer(Version.LUCENE_4_10_1);
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(texto));
        stream.reset();
        while (stream.incrementToken()) {
            resultN = resultN + (stream.getAttribute(CharTermAttribute.class).toString()) + " ";
        }
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return resultN.toLowerCase();
}
From source file:wt10g.WTDocument.java
String preProcess(String text) throws Exception {
    StringBuffer tokenizedContentBuff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        tokenizedContentBuff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return tokenizedContentBuff.toString();
}
From source file:yasoco.TermScore.java
Query constructQuery(int docId) throws Exception {
    Query q = null;
    boolean formSelectiveQueries = Boolean.parseBoolean(prop.getProperty("toptermquery", "true"));
    /* MoreLikeThis not working for some reason!
    if (formSelectiveQueries) {
        q = mlt.like(docId);
        return q;
    }
    */
    Document queryDoc = reader.document(docId);
    q = new BooleanQuery();
    int termCount = 0;
    TokenStream fs = null;
    List<IndexableField> fields = queryDoc.getFields();
    for (IndexableField field : fields) {
        String fieldName = field.name();
        if (fieldName.equals(JavaSCTree.FIELD_DOCNAME) || fieldName.equals(JavaSCTree.FIELD_SC))
            continue; // ignore non-searchable fields
        if (formSelectiveQueries) {
            List<TermScore> topList = selTerms(docId, field.name(), q);
            for (TermScore ts : topList) {
                Term thisTerm = new Term(field.name(), ts.term);
                ((BooleanQuery) q).add(new TermQuery(thisTerm), BooleanClause.Occur.SHOULD);
            }
        } else {
            fs = queryDoc.getField(fieldName).tokenStream(analyzer);
            CharTermAttribute termAtt = fs.addAttribute(CharTermAttribute.class);
            fs.reset();
            // add all tokens until the stream is exhausted
            while (fs.incrementToken()) {
                Term thisTerm = new Term(field.name(), termAtt.toString());
                termCount++;
                if (termCount == maxlimit) {
                    maxlimit = maxlimit << 1;
                    BooleanQuery.setMaxClauseCount(maxlimit);
                }
                ((BooleanQuery) q).add(new TermQuery(thisTerm), BooleanClause.Occur.SHOULD);
            }
            fs.end();
            fs.close();
        }
    }
    return q;
}