List of usage examples for org.apache.lucene.analysis TokenStream incrementToken
public abstract boolean incrementToken() throws IOException;
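All of the entries below follow the same basic consumption contract: obtain a TokenStream from an Analyzer, register the attributes of interest, call reset(), loop on incrementToken() until it returns false, then call end() and close(). As a quick orientation, here is a minimal, self-contained sketch of that loop; the analyzer, field name, and sample text are illustrative assumptions (a recent Lucene release is assumed), not taken from any of the listed projects.

// Minimal sketch of the standard incrementToken() consumption loop.
// Assumes a Lucene version (5.x or later) where StandardAnalyzer has a
// no-argument constructor; field name and text are made up.
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class IncrementTokenSketch {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream stream = analyzer.tokenStream("body", new StringReader("Hello token stream world"));
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            stream.reset();                    // required before the first incrementToken()
            while (stream.incrementToken()) {  // returns false once the stream is exhausted
                System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
            }
            stream.end();                      // records the final offset state
            stream.close();                    // releases the underlying resources
        }
    }
}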
From source file:com.aliasi.lingmed.medline.SearchableMedlineCodec.java
License:Lingpipe license
public static void main(String[] args) throws Exception {
    org.apache.lucene.store.RAMDirectory directory = new org.apache.lucene.store.RAMDirectory();
    // org.apache.lucene.analysis.SimpleAnalyzer analyzer
    //     = new org.apache.lucene.analysis.SimpleAnalyzer();
    // org.apache.lucene.analysis.KeywordAnalyzer analyzer
    //     = new org.apache.lucene.analysis.KeywordAnalyzer();
    MedlineCodec codec = new MedlineCodec();
    Analyzer analyzer = codec.getAnalyzer();

    org.apache.lucene.index.IndexWriterConfig iwConf = new org.apache.lucene.index.IndexWriterConfig(
            org.apache.lucene.util.Version.LUCENE_36, analyzer);
    iwConf.setOpenMode(org.apache.lucene.index.IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    org.apache.lucene.index.IndexWriter indexWriter = new org.apache.lucene.index.IndexWriter(directory, iwConf);

    Document doc = new Document();
    doc.add(new Field(Fields.MESH_MINOR_FIELD, "abc", Field.Store.NO, Field.Index.ANALYZED));
    doc.add(new Field(Fields.MESH_MINOR_FIELD, " xyz efg", Field.Store.NO, Field.Index.ANALYZED));
    indexWriter.addDocument(doc);
    indexWriter.close();

    org.apache.lucene.index.IndexReader reader = org.apache.lucene.index.IndexReader.open(directory);
    org.apache.lucene.search.IndexSearcher searcher = new org.apache.lucene.search.IndexSearcher(reader);
    org.apache.lucene.queryParser.QueryParser qp = new org.apache.lucene.queryParser.QueryParser(
            org.apache.lucene.util.Version.LUCENE_36, "foo", analyzer);
    org.apache.lucene.search.Query query = qp.parse(Fields.MESH_MINOR_FIELD + ":efg");
    org.apache.lucene.search.TopDocs hits = searcher.search(query, 1000);
    System.out.println("hits.length()=" + hits.scoreDocs.length);

    org.apache.lucene.analysis.TokenStream ts = analyzer.tokenStream(Fields.MESH_MINOR_FIELD,
            new java.io.StringReader("abc xyz efg"));
    org.apache.lucene.analysis.tokenattributes.CharTermAttribute terms = ts
            .addAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
    org.apache.lucene.analysis.tokenattributes.OffsetAttribute offsets = ts
            .addAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class);
    org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute positions = ts
            .addAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class);
    while (ts.incrementToken()) {
        int increment = positions.getPositionIncrement();
        int start = offsets.startOffset();
        int end = offsets.endOffset();
        String term = terms.toString();
        System.out.println("token=|" + term + "|" + " startOffset=" + start + " endOffset=" + end
                + " positionIncr=" + increment);
    }
}
From source file:com.antsdb.saltedfish.sql.vdm.LuceneUtil.java
License:Open Source License
static void tokenize(String text, BiConsumer<String, String> lambda) {
    try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
        TokenStream stream = analyzer.tokenStream("", text);
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        TypeAttribute type = stream.getAttribute(TypeAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            lambda.accept(type.type(), term.toString());
        }
    } catch (IOException x) {
        throw new RuntimeException(x);
    }
}
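A hypothetical call site for this helper (the input text and printing lambda are made up for illustration):

LuceneUtil.tokenize("Hello antsdb 2024", (type, term) -> System.out.println(type + " -> " + term));
// With the StandardAnalyzer this prints lines such as "<ALPHANUM> -> hello" and "<NUM> -> 2024",
// since the analyzer lowercases terms and tags numeric tokens with the <NUM> type.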
From source file:com.b2international.index.compat.Highlighting.java
License:Apache License
/**
 * Splits a string to a list of tokens using the specified Lucene analyzer.
 *
 * @param analyzer the analyzer determining token boundaries (may not be {@code null})
 * @param s the string to split
 * @return a list of tokens, or an empty list if {@code s} is {@code null} or empty
 */
public static List<String> split(Analyzer analyzer, final String s) {
    checkNotNull(analyzer, "analyzer");
    if (Strings.isNullOrEmpty(s)) {
        return ImmutableList.of();
    }
    final List<String> tokens = Lists.newArrayList();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(null, new StringReader(s));
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
    } catch (final IOException ignored) {
        // Should not be thrown when using a string reader
    } finally {
        endAndCloseQuietly(stream);
    }
    return tokens;
}
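A possible invocation, using a plain StandardAnalyzer purely for illustration (the project wires in its own analyzers):

List<String> tokens = Highlighting.split(new StandardAnalyzer(), "Clinical finding, NOS");
// With that analyzer, tokens would be ["clinical", "finding", "nos"].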
From source file:com.basistech.elasticsearch.index.analysis.rosette.SimpleRosetteAnalysisTests.java
License:Open Source License
public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    Assert.assertNotNull(termAttr);
    int i = 0;
    while (stream.incrementToken()) {
        String s = termAttr.toString();
        //out.printf("Output Token %2d: %s%n", i, s);
        Assert.assertTrue(i < expected.length, "got extra term: " + s);
        Assert.assertEquals(termAttr.toString(), expected[i], "expected different term at index " + i);
        i++;
    }
    Assert.assertEquals(i, expected.length, "not all tokens produced");
}
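A hypothetical use inside a test method, assuming a simple lowercasing analyzer (the real tests feed in streams produced by the Rosette analysis chain):

Analyzer analyzer = new StandardAnalyzer();  // stand-in analyzer for illustration only
TokenStream stream = analyzer.tokenStream("text", new StringReader("Simple Rosette test"));
assertSimpleTSOutput(stream, new String[] { "simple", "rosette", "test" });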
From source file:com.basistech.IndexFiles.java
License:Open Source License
private void iterateOverFiles(File directory) throws IOException {
    File[] textFiles = directory.listFiles(new FilenameFilter() {
        public boolean accept(File dir, String name) {
            return name.endsWith(".txt");
        }
    });
    for (File dataFile : textFiles) {
        Reader dataReader = null;
        try {
            dataReader = Files.newReader(dataFile, Charsets.UTF_8);
            TokenStream tokenStream = analyzer.tokenStream("full_text", dataReader);
            tokenStream.reset();
            OffsetAttribute offsets = tokenStream.getAttribute(OffsetAttribute.class);
            while (tokenStream.incrementToken()) {
                offsets.startOffset();
            }
        } finally {
            IOUtils.closeQuietly(dataReader);
        }
    }
}
From source file:com.bigdata.search.AbstractSearchTest.java
License:Open Source License
protected String getTokenStream(Analyzer a, String text) throws IOException {
    StringBuffer sb = new StringBuffer();
    TokenStream s = a.tokenStream(null, new StringReader(text));
    while (s.incrementToken()) {
        final TermAttribute term = s.getAttribute(TermAttribute.class);
        if (sb.length() != 0) {
            sb.append(" ");
        }
        sb.append(term.term());
    }
    return sb.toString();
}
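An illustrative call, assuming a Lucene 3.x StandardAnalyzer (the analyzer under test is supplied by the concrete test case):

String joined = getTokenStream(new StandardAnalyzer(Version.LUCENE_36), "The Quick Brown Fox");
// With that analyzer the result is "quick brown fox": the stopword "the" is dropped
// and the remaining terms are lowercased and re-joined with single spaces.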
From source file:com.bigdata.search.AbstractSearchTest.java
License:Open Source License
private void compareTokenStream(Analyzer a, String text, String expected[]) throws IOException {
    TokenStream s = a.tokenStream(null, new StringReader(text));
    int ix = 0;
    while (s.incrementToken()) {
        final TermAttribute term = s.getAttribute(TermAttribute.class);
        final String word = term.term();
        assertTrue(ix < expected.length);
        assertEquals(expected[ix++], word);
    }
    assertEquals(ix, expected.length);
}
From source file:com.bigdata.search.FullTextIndex.java
License:Open Source License
/**
 * Index a field in a document.
 * <p>
 * Note: This method does NOT force a write on the indices. If the <i>buffer</i>
 * overflows, then there will be an index write. Once the caller is done
 * indexing, they MUST invoke {@link TokenBuffer#flush()} to force any data
 * remaining in their <i>buffer</i> to the indices.
 * <p>
 * Note: If a document is pre-existing, then the existing data for that
 * document MUST be removed unless you know that the fields to be found in
 * the document will not have changed (they may have different contents, but the
 * same fields exist in the old and new versions of the document).
 *
 * @param buffer
 *            Used to buffer writes onto the text index.
 * @param docId
 *            The document identifier.
 * @param fieldId
 *            The field identifier.
 * @param languageCode
 *            The language code -or- <code>null</code> to use the default
 *            {@link Locale}.
 * @param r
 *            A reader on the text to be indexed.
 * @param filterStopwords
 *            if true, filter stopwords from the token stream
 *
 * @see TokenBuffer#flush()
 */
public void index(final TokenBuffer<V> buffer, final V docId, final int fieldId, final String languageCode,
        final Reader r, final boolean filterStopwords) {
    /*
     * Note: You can invoke this on a read-only index. It is only overflow
     * of the TokenBuffer that requires a writable index. Overflow itself
     * will only occur on {document,field} tuple boundaries, so it will
     * never overflow when indexing a search query.
     */
    // assertWritable();

    int n = 0;

    // tokenize (note: docId,fieldId are not on the tokenStream, but the field could be).
    final TokenStream tokenStream = getTokenStream(languageCode, r, filterStopwords);

    try {
        while (tokenStream.incrementToken()) {
            final TermAttribute term = tokenStream.getAttribute(TermAttribute.class);
            buffer.add(docId, fieldId, term.term());
            n++;
        }
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }

    if (log.isInfoEnabled())
        log.info("Indexed " + n + " tokens: docId=" + docId + ", fieldId=" + fieldId);
}
From source file:com.billiger.solr.handler.component.QLTBComponent.java
License:Apache License
/**
 * Get analyzed version of the query string.
 *
 * This uses the analyzer for the configured FieldType for this
 * component to analyze and re-assemble the original query string.
 * If no queryFieldType is configured, the original query will be
 * returned.
 *
 * This is used both in the prepare() stage of the component and
 * when reading the QLTB map data.
 */
String getAnalyzedQuery(String query) throws IOException {
    if (analyzer == null) {
        return query;
    }
    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", new StringReader(query));
    tokens.reset();
    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    while (tokens.incrementToken()) {
        norm.append(termAtt.buffer(), 0, termAtt.length());
    }
    tokens.end();
    tokens.close();
    return norm.toString();
}
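Note that the analyzed terms are appended without any separator. A hypothetical call (assuming the configured field type uses a lowercasing, whitespace-splitting analyzer) illustrates the effect:

String normalized = getAnalyzedQuery("Foo Bar");
// normalized is "foobar" -- the individual terms are concatenated back-to-back.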
From source file:com.bizosys.hsearch.inpipe.ComputeTokens.java
License:Apache License
private void tokenize(Doc doc, TermStream ts) throws SystemFault, ApplicationFault, IOException {
    if (null == ts)
        return;

    TokenStream stream = ts.stream;
    if (null == stream)
        return;

    DocTerms terms = doc.terms;
    if (null == doc.terms) {
        terms = new DocTerms();
        doc.terms = terms;
    }

    String token = null;
    int offset = 0;
    CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        token = termA.toString();
        offset = offsetA.startOffset();
        Term term = new Term(doc.tenant, token, ts.sighting, ts.type, offset);
        terms.getTermList().add(term);
    }
    stream.close();
}