List of usage examples for org.apache.lucene.analysis Tokenizer reset
@Override
public void reset() throws IOException
From source file:com.devb.search.IndicIndexer.java
License:Apache License
@Override public void makeIndex() { String indexPath = servletContext.getRealPath("/") + "/hindex/"; String docsPath = servletContext.getRealPath("/") + "/hdocs/"; boolean create = true; final File docDir = new File(docsPath); if (!docDir.exists() || !docDir.canRead()) { System.out.println("Document directory '" + docDir.getAbsolutePath() + "' does not exist or is not readable, please check the path\n"); return;//from w w w . jav a 2 s . c om } Date start = new Date(); try { System.out.println("Indexing to directory '" + indexPath + "'...\n"); org.apache.lucene.store.Directory dir = FSDirectory.open(new File(indexPath)); Analyzer analyzer = new HindiAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(null, analyzer); if (create) { iwc.setOpenMode(OpenMode.CREATE); } else { iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } IndexWriter writer = new IndexWriter(dir, iwc); if (docDir.canRead()) { if (docDir.isDirectory()) { String[] files = docDir.list(); if (files != null) { for (int i = 0; i < files.length; i++) { File file = new File(docDir, files[i]); FileInputStream fileInputStream = new FileInputStream(file); BufferedReader reader = new BufferedReader( new InputStreamReader(fileInputStream, "UTF-8")); Tokenizer tokenizer = new StandardTokenizer(reader); CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class); tokenizer.reset(); int lineNumber = 0; try { while (tokenizer.incrementToken()) { Document doc = new Document(); Field pathField = new StringField("path", file.getName(), Field.Store.YES); doc.add(pathField); TextField nField = new TextField("linenumber", new Integer(++lineNumber).toString(), Store.YES); doc.add(nField); TextField field = new TextField("contents", termAtt.toString(), Store.YES); doc.add(field); writer.addDocument(doc); } System.out.println("Adding " + file + "\n"); } catch (Exception e) { e.printStackTrace(); } finally { tokenizer.close(); reader.close(); fileInputStream.close(); } } } } } writer.close(); Date end = new 
Date(); System.out.println((end.getTime() - start.getTime()) + " total milliseconds\n"); } catch (IOException e) { System.out.println("Caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java
License:Apache License
/**
 * Verifies MeCabKoStandardTokenizer output for two short Korean sentences and
 * exercises the tokenizer-reuse cycle: tokenize, reset(), setReader(), tokenize again.
 *
 * NOTE(review): the expected strings contain characters that appear mojibake-garbled
 * ('?') in this copy of the source — they must match the tokenizer output
 * byte-for-byte, so do not retype or "fix" them.
 */
@Test
public void testShortSentence() throws Exception {
    Tokenizer tokenizer = createTokenizer(new StringReader(" ? ?"), 2);
    assertEquals(
            ":N:NNG:null:1:1:0:1,:N:NNG:null:1:1:1:3," + ":N:NNG:null:1:1:4:5,?:COMPOUND:Compound:null:0:2:4:7,"
                    + "?:N:NNG:null:1:1:5:7,?:N:NNG:null:1:1:8:12,",
            tokenizerToString(tokenizer));
    // reset() then setReader() prepares the same tokenizer instance for a new input
    tokenizer.reset();
    tokenizer.setReader(new StringReader(" ?? ."));
    assertEquals(":N:NNG:null:1:1:0:2,?:N:NNG:null:1:1:3:5,"
            + "?:COMPOUND:Compound:null:0:2:3:6,:N:NNG:null:1:1:5:6,"
            + "?:EOJEOL:NNG+JKS:null:1:1:6:8,:N:NNG:null:0:1:6:7,"
            + ":EOJEOL:VV+EP+EF:null:1:1:9:14,", tokenizerToString(tokenizer));
    tokenizer.close();
}
From source file:com.sindicetech.siren.analysis.filter.TestMailtoFilter.java
License:Open Source License
private void assertURLDecodedTo(final Tokenizer t, final String uri, final String[] expectedStems, final String[] expectedTypes, final int[] expectedPosIncr) throws IOException { assertTrue("has CharTermAttribute", t.hasAttribute(CharTermAttribute.class)); final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class)); final TypeAttribute typeAtt = t.getAttribute(TypeAttribute.class); assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class)); final PositionIncrementAttribute posIncrAtt = t.getAttribute(PositionIncrementAttribute.class); t.setReader(new StringReader(uri)); t.reset(); final TokenFilter filter = new MailtoFilter(t); for (int i = 0; i < expectedStems.length; i++) { assertTrue("token " + i + " exists", filter.incrementToken()); assertEquals(expectedStems[i], termAtt.toString()); if (expectedTypes == null) assertEquals(uritype, typeAtt.type()); else//from ww w .ja v a 2 s. c o m assertEquals(expectedTypes[i], typeAtt.type()); if (expectedPosIncr != null) assertEquals(expectedPosIncr[i], posIncrAtt.getPositionIncrement()); } filter.end(); filter.close(); }
From source file:com.sindicetech.siren.analysis.filter.TestURIEncodingFilter.java
License:Open Source License
/**
 * Decodes {@code uri} through a {@link URIDecodingFilter} (using {@code encoding})
 * built on top of the given tokenizer and verifies each produced term, type and
 * position increment. A null {@code expectedTypes} means every token must carry
 * the default URI type; a null {@code expectedPosIncr} disables increment checks.
 */
private void assertURLDecodedTo(final Tokenizer t, final String encoding, final String uri,
        final String[] expectedStems, final String[] expectedTypes, final int[] expectedPosIncr)
        throws IOException {
    assertTrue("has CharTermAttribute", t.hasAttribute(CharTermAttribute.class));
    final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
    final TypeAttribute typeAtt = t.getAttribute(TypeAttribute.class);
    assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
    final PositionIncrementAttribute posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);

    t.setReader(new StringReader(uri));
    t.reset(); // must precede the first incrementToken()
    final URIDecodingFilter filter = new URIDecodingFilter(t, encoding);

    int i = 0;
    while (i < expectedStems.length) {
        assertTrue("token " + i + " exists", filter.incrementToken());
        assertEquals(expectedStems[i], termAtt.toString());
        if (expectedTypes == null) {
            assertEquals(uritype, typeAtt.type());
        } else {
            assertEquals(expectedTypes[i], typeAtt.type());
        }
        if (expectedPosIncr != null) {
            assertEquals(expectedPosIncr[i], posIncrAtt.getPositionIncrement());
        }
        i++;
    }
    filter.end();
    filter.close();
}
From source file:com.sindicetech.siren.analysis.filter.TestURILocalnameFilter.java
License:Open Source License
public void assertNormalisesTo(final Tokenizer t, final String input, final String[] expectedImages, final String[] expectedTypes, final int[] expectedPosIncrs) throws Exception { assertTrue("has TermAttribute", t.hasAttribute(CharTermAttribute.class)); final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); TypeAttribute typeAtt = null;//from w w w. j av a 2 s .c o m if (expectedTypes != null) { assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class)); typeAtt = t.getAttribute(TypeAttribute.class); } PositionIncrementAttribute posIncrAtt = null; if (expectedPosIncrs != null) { assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class)); posIncrAtt = t.getAttribute(PositionIncrementAttribute.class); } t.setReader(new StringReader(input)); t.reset(); final URILocalnameFilter filter = new URILocalnameFilter(t); filter.setMaxLength(MAX_LENGTH); for (int i = 0; i < expectedImages.length; i++) { assertTrue("token " + i + " exists", filter.incrementToken()); assertEquals(expectedImages[i], termAtt.toString()); if (expectedTypes != null) { assertEquals(expectedTypes[i], typeAtt.type()); } if (expectedPosIncrs != null) { assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement()); } } assertFalse("end of stream", filter.incrementToken()); filter.end(); filter.close(); }
From source file:com.sindicetech.siren.analysis.filter.TestURINormalisationFilter.java
License:Open Source License
public void assertNormalisesTo(final Tokenizer t, final String input, final String[] expectedImages, final String[] expectedTypes) throws Exception { assertTrue("has CharTermAttribute", t.hasAttribute(CharTermAttribute.class)); final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); TypeAttribute typeAtt = null;//from ww w . java2 s . c o m if (expectedTypes != null) { assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class)); typeAtt = t.getAttribute(TypeAttribute.class); } t.setReader(new StringReader(input)); t.reset(); final TokenStream filter = new URINormalisationFilter(t); for (int i = 0; i < expectedImages.length; i++) { assertTrue("token " + i + " exists", filter.incrementToken()); assertEquals(expectedImages[i], termAtt.toString()); if (expectedTypes != null) { assertEquals(expectedTypes[i], typeAtt.type()); } } assertFalse("end of stream", filter.incrementToken()); filter.end(); filter.close(); }
From source file:com.sindicetech.siren.analysis.NodeTokenizerTestCase.java
License:Open Source License
/** * Execute the tokenizer on the given input. Used to validate the exceptions thrown by the tokenizer. *///from w w w .ja va 2 s. c om protected void assertTokenizesTo(final Tokenizer t, final String input) throws Exception { t.setReader(new StringReader(input)); t.reset(); // reset the stream for the new reader while (t.incrementToken()) { // do nothing } t.end(); t.close(); }
From source file:com.sindicetech.siren.analysis.NodeTokenizerTestCase.java
License:Open Source License
/**
 * Tokenizes {@code input} and verifies, token by token, the term image plus any of
 * the optional attribute expectations whose array is non-null: field (PathAttribute),
 * type, datatype URI, position increment and node path. Finally asserts the stream
 * is exhausted, then ends and closes it.
 */
protected void assertTokenizesTo(final Tokenizer t, final String input, final String[] expectedImages,
        final String[] expectedFields, final String[] expectedTypes, final String[] expectedDatatypes,
        final int[] expectedPosIncrs, final IntsRef[] expectedNode) throws Exception {
    assertTrue("has TermAttribute", t.hasAttribute(CharTermAttribute.class));
    final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    // Each optional expectation array pulls in its matching attribute only when requested.
    PathAttribute fieldAtt = null;
    if (expectedFields != null) {
        assertTrue("has FieldAttribute", t.hasAttribute(PathAttribute.class));
        fieldAtt = t.getAttribute(PathAttribute.class);
    }
    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }
    DatatypeAttribute dtypeAtt = null;
    if (expectedDatatypes != null) {
        assertTrue("has DatatypeAttribute", t.hasAttribute(DatatypeAttribute.class));
        dtypeAtt = t.getAttribute(DatatypeAttribute.class);
    }
    PositionIncrementAttribute posIncrAtt = null;
    if (expectedPosIncrs != null) {
        assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);
    }
    NodeAttribute nodeAtt = null;
    if (expectedNode != null) {
        assertTrue("has NodeAttribute", t.hasAttribute(NodeAttribute.class));
        nodeAtt = t.getAttribute(NodeAttribute.class);
    }
    t.setReader(new StringReader(input));
    t.reset(); // reset the stream for the new reader
    for (int i = 0; i < expectedImages.length; i++) {
        assertTrue("token " + i + " exists", t.incrementToken());
        assertEquals("i=" + i, expectedImages[i], termAtt.toString());
        if (expectedFields != null) {
            assertEquals("i=" + i, expectedFields[i], fieldAtt.field());
        }
        if (expectedTypes != null) {
            assertEquals("i=" + i, expectedTypes[i], typeAtt.type());
        }
        if (expectedDatatypes != null) {
            // a null datatype URI is compared as the empty string
            assertEquals("i=" + i, expectedDatatypes[i],
                    dtypeAtt.datatypeURI() == null ? "" : String.valueOf(dtypeAtt.datatypeURI()));
        }
        if (expectedPosIncrs != null) {
            assertEquals("i=" + i, expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
        }
        if (expectedNode != null) {
            assertEquals("i=" + i, expectedNode[i], nodeAtt.node());
        }
    }
    assertFalse("end of stream", t.incrementToken());
    t.end();
    t.close();
}
From source file:com.sindicetech.siren.analysis.NodeTokenizerTestCase.java
License:Open Source License
protected void print(final Tokenizer t, final String input) throws Exception { final CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class); TypeAttribute typeAtt = t.getAttribute(TypeAttribute.class); DatatypeAttribute dtypeAtt = t.getAttribute(DatatypeAttribute.class); PositionIncrementAttribute posIncrAtt = t.getAttribute(PositionIncrementAttribute.class); NodeAttribute nodeAtt = t.getAttribute(NodeAttribute.class); PathAttribute fieldAtt = t.getAttribute(PathAttribute.class); t.setReader(new StringReader(input)); t.reset(); // reset the stream for the new reader StringBuilder builder = new StringBuilder(); while (t.incrementToken()) { builder.setLength(0);//from www . jav a 2 s . c o m builder.append(fieldAtt.field()); builder.append(", "); builder.append(termAtt.toString()); builder.append(", "); builder.append(typeAtt.type()); builder.append(", "); builder.append(dtypeAtt.datatypeURI() == null ? "" : String.valueOf(dtypeAtt.datatypeURI())); builder.append(", "); builder.append(posIncrAtt.getPositionIncrement()); builder.append(", "); builder.append(nodeAtt.node()); System.out.println(builder.toString()); } t.end(); t.close(); }
From source file:fi.nationallibrary.ndl.solrvoikko2.TestApp.java
License:Open Source License
public static void main(String[] args) throws IOException { BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in)); Voikko voikko = null;//from w ww . j a v a2 s. c o m try { ConcurrentMap<String, List<CompoundToken>> cache = new ConcurrentLinkedHashMap.Builder<String, List<CompoundToken>>() .maximumWeightedCapacity(100).build(); voikko = new Voikko("fi-x-morphoid"); StringReader reader = new StringReader(""); Tokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); tokenizer.setReader(reader); tokenizer.reset(); voikko = new Voikko("fi-x-morphoid"); VoikkoFilter voikkoFilter = new VoikkoFilter(tokenizer, voikko, true, VoikkoFilter.DEFAULT_MIN_WORD_SIZE, VoikkoFilter.DEFAULT_MIN_SUBWORD_SIZE, VoikkoFilter.DEFAULT_MAX_SUBWORD_SIZE, true, cache, 0); String text; System.out.println(); System.out.println("Enter word or phrase"); while ((text = stdin.readLine()) != null) { List<Analysis> analysisList = voikko.analyze(text); if (analysisList.isEmpty()) { System.out.println("No analysis available"); } for (Analysis analysis : analysisList) { System.out.println("Analysis:"); if (analysis.containsKey(BASEFORM)) { WordComponent component = new WordComponent(); component.component = analysis.get(BASEFORM); component.startInOriginal = 0; component.lengthInOriginal = text.length(); print(component); } if (analysis.containsKey(WORDBASES)) { System.out.println(analysis.get(WORDBASES)); } } tokenizer.close(); reader = new StringReader(text); tokenizer.setReader(reader); tokenizer.reset(); System.out.println("\nVoikkoFilter results:"); while (voikkoFilter.incrementToken()) { System.out.println( voikkoFilter.termAtt.toString() + " [" + voikkoFilter.posIncAtt.getPositionIncrement() + ":" + voikkoFilter.offsetAtt.startOffset() + ":" + voikkoFilter.offsetAtt.endOffset() + "]"); } System.out.println(); System.out.println("Enter word or phrase"); } voikkoFilter.close(); } finally { voikko.terminate(); } }