List of usage examples for org.apache.lucene.analysis Tokenizer close
@Override public void close() throws IOException
NOTE: The default implementation closes the input Reader, so be sure to call super.close() when overriding this method.
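In practice that means an overriding close() must release the subclass's own state and still delegate upward, or the input Reader leaks. Below is a minimal sketch of a compliant override, assuming the Lucene 4.x Tokenizer(Reader) constructor used by the examples on this page; the class name and the extra nativeResource field are hypothetical, for illustration only.

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;

// Hypothetical tokenizer that owns an extra resource besides the input Reader.
public final class ResourceHoldingTokenizer extends Tokenizer {

    private final AutoCloseable nativeResource; // hypothetical extra resource

    public ResourceHoldingTokenizer(Reader input, AutoCloseable nativeResource) {
        super(input); // Lucene 4.x; newer versions wire the Reader via setReader(...)
        this.nativeResource = nativeResource;
    }

    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        return false; // a real implementation would read from this.input and fill attributes
    }

    @Override
    public void close() throws IOException {
        try {
            nativeResource.close(); // release our own resource first
        } catch (Exception e) {
            throw new IOException(e);
        } finally {
            super.close(); // required: the base class closes the input Reader here
        }
    }
}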
From source file: com.devb.search.IndicIndexer.java
License: Apache License
@Override
public void makeIndex() {
    String indexPath = servletContext.getRealPath("/") + "/hindex/";
    String docsPath = servletContext.getRealPath("/") + "/hdocs/";
    boolean create = true;
    final File docDir = new File(docsPath);
    if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath()
                + "' does not exist or is not readable, please check the path\n");
        return;
    }
    Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...\n");
        org.apache.lucene.store.Directory dir = FSDirectory.open(new File(indexPath));
        Analyzer analyzer = new HindiAnalyzer();
        // In Lucene 4.x the first argument is the Version constant for compatibility defaults.
        IndexWriterConfig iwc = new IndexWriterConfig(null, analyzer);
        if (create) {
            iwc.setOpenMode(OpenMode.CREATE);
        } else {
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }
        IndexWriter writer = new IndexWriter(dir, iwc);
        if (docDir.canRead() && docDir.isDirectory()) {
            String[] files = docDir.list();
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    File file = new File(docDir, files[i]);
                    FileInputStream fileInputStream = new FileInputStream(file);
                    BufferedReader reader = new BufferedReader(
                            new InputStreamReader(fileInputStream, "UTF-8"));
                    Tokenizer tokenizer = new StandardTokenizer(reader);
                    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
                    tokenizer.reset();
                    int lineNumber = 0; // note: increments once per token, not per line
                    try {
                        while (tokenizer.incrementToken()) {
                            Document doc = new Document();
                            Field pathField = new StringField("path", file.getName(), Field.Store.YES);
                            doc.add(pathField);
                            TextField nField = new TextField("linenumber",
                                    Integer.toString(++lineNumber), Store.YES);
                            doc.add(nField);
                            TextField field = new TextField("contents", termAtt.toString(), Store.YES);
                            doc.add(field);
                            writer.addDocument(doc);
                        }
                        tokenizer.end(); // per the TokenStream contract, call end() before close()
                        System.out.println("Adding " + file + "\n");
                    } catch (Exception e) {
                        e.printStackTrace();
                    } finally {
                        tokenizer.close(); // also closes the underlying Reader
                        reader.close();
                        fileInputStream.close();
                    }
                }
            }
        }
        writer.close();
        Date end = new Date();
        System.out.println((end.getTime() - start.getTime()) + " total milliseconds\n");
    } catch (IOException e) {
        System.out.println("Caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
From source file: com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java
License: Apache License
@Test
public void testEmptyQuery() throws Exception {
    Tokenizer tokenizer = createTokenizer(new StringReader(""),
            TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH);
    assertEquals(false, tokenizer.incrementToken());
    tokenizer.close();
}
From source file: com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java
License: Apache License
@Test
public void testEmptyMorphemes() throws Exception {
    Tokenizer tokenizer = createTokenizer(new StringReader("!@#$%^&*"),
            TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH);
    assertEquals(false, tokenizer.incrementToken());
    tokenizer.close();
}
From source file: com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java
License: Apache License
@Test
public void testShortSentence() throws Exception {
    Tokenizer tokenizer = createTokenizer(new StringReader(" ? ?"), 2);
    assertEquals(
            ":N:NNG:null:1:1:0:1,:N:NNG:null:1:1:1:3,"
                    + ":N:NNG:null:1:1:4:5,?:COMPOUND:Compound:null:0:2:4:7,"
                    + "?:N:NNG:null:1:1:5:7,?:N:NNG:null:1:1:8:12,",
            tokenizerToString(tokenizer));
    tokenizer.reset();
    tokenizer.setReader(new StringReader(" ?? ."));
    assertEquals(
            ":N:NNG:null:1:1:0:2,?:N:NNG:null:1:1:3:5,"
                    + "?:COMPOUND:Compound:null:0:2:3:6,:N:NNG:null:1:1:5:6,"
                    + "?:EOJEOL:NNG+JKS:null:1:1:6:8,:N:NNG:null:0:1:6:7,"
                    + ":EOJEOL:VV+EP+EF:null:1:1:9:14,",
            tokenizerToString(tokenizer));
    tokenizer.close();
}
From source file: com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java
License: Apache License
@Ignore
public void testComplexSentence() throws Exception {
    Tokenizer tokenizer = createTokenizer(
            new StringReader(" ?? ? " + "?? ?? ."),
            TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH);
    assertEquals(
            ":EOJEOL:1:1:0:4,:N:0:1:0:2,:EOJEOL:1:1:5:8,"
                    + "??:EOJEOL:1:1:9:13,?:XR:0:1:9:11,:N:1:1:14:16,"
                    + ":EOJEOL:1:1:17:21,:N:0:1:17:20,:EOJEOL:1:1:22:25,"
                    + ":N:0:1:22:24,:N:1:1:26:27,:COMPOUND:0:2:26:29,"
                    + ":N:1:1:27:29,?:EOJEOL:1:1:30:33,??:EOJEOL:1:1:34:37,"
                    + "?:MAG:0:1:34:36,:MM:1:1:38:39,??:EOJEOL:1:1:40:42,"
                    + "?:N:0:1:40:41,:INFLECT:1:1:43:47,",
            tokenizerToString(tokenizer));
    tokenizer.close();
}
From source file: com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java
License: Apache License
@Test
public void testHanEnglish() throws Exception {
    Tokenizer tokenizer = createTokenizer(new StringReader("win"),
            TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH);
    assertEquals(":N:NNG:null:1:1:0:2,win:SL:SL:null:1:1:2:5,",
            tokenizerToString(tokenizer));
    tokenizer.close();
}
From source file: com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java
License: Apache License
@Test
public void testDecompound() throws Exception {
    Tokenizer tokenizer = createTokenizer(new StringReader(""),
            TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH);
    assertEquals(
            ":N:NNG:null:1:1:0:2,:COMPOUND:Compound:null:0:2:0:3,:N:NNG:null:1:1:2:3,",
            tokenizerToString(tokenizer));
    tokenizer.close();

    tokenizer = createTokenizer(new StringReader(""),
            TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH);
    assertEquals(
            ":N:NNG:null:1:1:0:2,:COMPOUND:Compound:null:0:2:0:4,"
                    + ":N:NNG:null:1:1:2:4,",
            tokenizerToString(tokenizer));
    tokenizer.close();
}
From source file: com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java
License: Apache License
@Test
public void testNoDecompound() throws Exception {
    Tokenizer tokenizer = createTokenizer(new StringReader(""),
            TokenGenerator.NO_DECOMPOUND);
    assertEquals(":COMPOUND:NNG:null:1:2:0:3,", tokenizerToString(tokenizer));
    tokenizer.close();

    tokenizer = createTokenizer(new StringReader(""), TokenGenerator.NO_DECOMPOUND);
    assertEquals(":COMPOUND:NNG:null:1:2:0:4,", tokenizerToString(tokenizer));
    tokenizer.close();
}
From source file: com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java
License: Apache License
@Test
public void testPreanalysisSentence() throws Exception {
    Tokenizer tokenizer = createTokenizer(new StringReader("? ? ?."),
            TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH);
    assertEquals(
            "?:N:NNG:null:1:1:0:2,:N:NR:null:1:1:2:3,:N:NNG:null:1:1:3:4,"
                    + "?:EOJEOL:NNG+JX:null:1:1:5:10,?:N:NNG:null:0:1:5:9,"
                    + ":N:NNG:null:1:1:11:13,?:EOJEOL:NNG+VCP+EF:null:1:1:13:17,"
                    + ":N:NNG:null:0:1:13:15,",
            tokenizerToString(tokenizer));
    tokenizer.close();
}
From source file: com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java
License: Apache License
@Test
public void testUnknownSurface() throws Exception {
    Tokenizer tokenizer = createTokenizer(new StringReader(" "),
            TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH);
    assertEquals(
            ":UNKNOWN:UNKNOWN:null:1:1:0:2,:EOJEOL:VA+ETM:null:1:1:3:5,"
                    + ":N:NNG:null:1:1:6:8,",
            tokenizerToString(tokenizer));
    tokenizer.close();
}