Example usage for org.apache.lucene.analysis Tokenizer close

List of usage examples for org.apache.lucene.analysis Tokenizer close

Introduction

In this page you can find the example usage for org.apache.lucene.analysis Tokenizer close.

Prototype

@Override
public void close() throws IOException 

Source Link

Document

NOTE: The default implementation closes the input Reader, so be sure to call super.close() when overriding this method.

Usage

From source file:com.devb.search.IndicIndexer.java

License:Apache License

@Override
public void makeIndex() {
    // Builds (or rebuilds) a Lucene index under <webapp>/hindex/ from the
    // plain-text documents found in <webapp>/hdocs/, analyzing each file
    // with HindiAnalyzer and adding one document per token.
    String indexPath = servletContext.getRealPath("/") + "/hindex/";
    String docsPath = servletContext.getRealPath("/") + "/hdocs/";
    boolean create = true; // always CREATE today; flip to append to an existing index

    final File docDir = new File(docsPath);
    if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath()
                + "' does not exist or is not readable, please check the path\n");
        return;
    }

    Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...\n");

        org.apache.lucene.store.Directory dir = FSDirectory.open(new File(indexPath));
        Analyzer analyzer = new HindiAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(null, analyzer);
        iwc.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);

        // try-with-resources guarantees the IndexWriter is closed (and the
        // index unlocked) even if indexing a file throws; the original
        // leaked the writer on any exception before writer.close().
        try (IndexWriter writer = new IndexWriter(dir, iwc)) {
            if (docDir.isDirectory()) {
                String[] files = docDir.list();
                if (files != null) {
                    for (String name : files) {
                        indexSingleFile(writer, new File(docDir, name));
                    }
                }
            }
        }

        Date end = new Date();
        System.out.println((end.getTime() - start.getTime()) + " total milliseconds\n");

    } catch (IOException e) {
        System.out.println("Caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

/**
 * Tokenizes one UTF-8 text file and adds a Lucene document per token.
 * Failures are logged and skipped so one bad file does not abort the
 * whole indexing run (same best-effort behavior as before).
 *
 * @param writer open IndexWriter to add documents to
 * @param file   the file to tokenize and index
 */
private void indexSingleFile(IndexWriter writer, File file) {
    // try-with-resources closes tokenizer, reader and stream in reverse
    // order even when tokenization throws; the original leaked the
    // FileInputStream if the reader/tokenizer constructors failed.
    try (FileInputStream fileInputStream = new FileInputStream(file);
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(fileInputStream, "UTF-8"));
            Tokenizer tokenizer = new StandardTokenizer(reader)) {
        CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        int tokenNumber = 0;
        while (tokenizer.incrementToken()) {
            Document doc = new Document();
            doc.add(new StringField("path", file.getName(), Field.Store.YES));
            // NOTE(review): this counts tokens, not lines, despite the
            // field name — kept for compatibility with existing queries.
            doc.add(new TextField("linenumber", Integer.toString(++tokenNumber), Store.YES));
            doc.add(new TextField("contents", termAtt.toString(), Store.YES));
            writer.addDocument(doc);
        }
        tokenizer.end(); // Lucene TokenStream contract: end() before close()
        System.out.println("Adding " + file + "\n");
    } catch (Exception e) {
        // Best-effort: log and continue with the next file.
        e.printStackTrace();
    }
}

From source file:com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java

License:Apache License

@Test
public void testEmptyQuery() throws Exception {
    // An empty input must produce no tokens at all.
    // try-with-resources fixes a leak: close() was previously reached
    // only when the assertion passed.
    try (Tokenizer tokenizer = createTokenizer(new StringReader(""),
            TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH)) {
        assertEquals(false, tokenizer.incrementToken());
    }
}

From source file:com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java

License:Apache License

@Test
public void testEmptyMorphemes() throws Exception {
    // Pure punctuation/symbols must yield no morpheme tokens.
    // try-with-resources fixes a leak: close() was previously reached
    // only when the assertion passed.
    try (Tokenizer tokenizer = createTokenizer(new StringReader("!@#$%^&*"),
            TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH)) {
        assertEquals(false, tokenizer.incrementToken());
    }
}

From source file:com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java

License:Apache License

@Test
public void testShortSentence() throws Exception {
    // Tokenizes a short sentence (compound-noun min length 2), then
    // reuses the same Tokenizer instance on a second reader.
    // try-with-resources fixes a leak: close() was previously skipped
    // whenever an assertion failed.
    try (Tokenizer tokenizer = createTokenizer(new StringReader(" ? ?"), 2)) {
        assertEquals(
                ":N:NNG:null:1:1:0:1,:N:NNG:null:1:1:1:3,"
                        + ":N:NNG:null:1:1:4:5,?:COMPOUND:Compound:null:0:2:4:7,"
                        + "?:N:NNG:null:1:1:5:7,?:N:NNG:null:1:1:8:12,",
                tokenizerToString(tokenizer));

        // NOTE(review): Lucene's documented reuse order is setReader()
        // followed by reset(); this calls reset() first. Kept as-is since
        // tokenizerToString() appears to manage the stream lifecycle
        // itself — TODO confirm against its implementation.
        tokenizer.reset();
        tokenizer.setReader(new StringReader(" ?? ."));
        assertEquals(":N:NNG:null:1:1:0:2,?:N:NNG:null:1:1:3:5,"
                + "?:COMPOUND:Compound:null:0:2:3:6,:N:NNG:null:1:1:5:6,"
                + "?:EOJEOL:NNG+JKS:null:1:1:6:8,:N:NNG:null:0:1:6:7,"
                + ":EOJEOL:VV+EP+EF:null:1:1:9:14,", tokenizerToString(tokenizer));
    }
}

From source file:com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java

License:Apache License

@Ignore
public void testComplexSentence() throws Exception {
    // Currently ignored; expected-token string may be stale relative to
    // the dictionary. try-with-resources fixes a leak: close() was
    // previously skipped when the assertion failed.
    try (Tokenizer tokenizer = createTokenizer(
            new StringReader(
                    "  ??     ? "
                            + "??  ?? ."),
            TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH)) {
        assertEquals(":EOJEOL:1:1:0:4,:N:0:1:0:2,:EOJEOL:1:1:5:8,"
                + "??:EOJEOL:1:1:9:13,?:XR:0:1:9:11,:N:1:1:14:16,"
                + ":EOJEOL:1:1:17:21,:N:0:1:17:20,:EOJEOL:1:1:22:25,"
                + ":N:0:1:22:24,:N:1:1:26:27,:COMPOUND:0:2:26:29,"
                + ":N:1:1:27:29,?:EOJEOL:1:1:30:33,??:EOJEOL:1:1:34:37,"
                + "?:MAG:0:1:34:36,:MM:1:1:38:39,??:EOJEOL:1:1:40:42,"
                + "?:N:0:1:40:41,:INFLECT:1:1:43:47,", tokenizerToString(tokenizer));
    }
}

From source file:com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java

License:Apache License

@Test
public void testHanEnglish() throws Exception {
    // Mixed Korean + Latin input: the Latin run must come out as an SL
    // (foreign language) token. try-with-resources fixes a leak:
    // close() was previously skipped when the assertion failed.
    try (Tokenizer tokenizer = createTokenizer(new StringReader("win"),
            TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH)) {
        assertEquals(":N:NNG:null:1:1:0:2,win:SL:SL:null:1:1:2:5,", tokenizerToString(tokenizer));
    }
}

From source file:com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java

License:Apache License

@Test
public void testDecompound() throws Exception {
    // With the default compound-noun minimum length, compounds are
    // decompounded into their constituent nouns. Each tokenizer now
    // lives in its own try-with-resources block, fixing a leak:
    // close() was previously skipped when an assertion failed.
    try (Tokenizer tokenizer = createTokenizer(new StringReader(""),
            TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH)) {
        assertEquals(":N:NNG:null:1:1:0:2,:COMPOUND:Compound:null:0:2:0:3,:N:NNG:null:1:1:2:3,",
                tokenizerToString(tokenizer));
    }

    try (Tokenizer tokenizer = createTokenizer(new StringReader(""),
            TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH)) {
        assertEquals(":N:NNG:null:1:1:0:2,:COMPOUND:Compound:null:0:2:0:4,"
                + ":N:NNG:null:1:1:2:4,", tokenizerToString(tokenizer));
    }
}

From source file:com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java

License:Apache License

@Test
public void testNoDecompound() throws Exception {
    // With NO_DECOMPOUND, compounds are emitted as single tokens.
    // Each tokenizer now lives in its own try-with-resources block,
    // fixing a leak: close() was previously skipped when an assertion
    // failed.
    try (Tokenizer tokenizer = createTokenizer(new StringReader(""), TokenGenerator.NO_DECOMPOUND)) {
        assertEquals(":COMPOUND:NNG:null:1:2:0:3,", tokenizerToString(tokenizer));
    }

    try (Tokenizer tokenizer = createTokenizer(new StringReader(""), TokenGenerator.NO_DECOMPOUND)) {
        assertEquals(":COMPOUND:NNG:null:1:2:0:4,", tokenizerToString(tokenizer));
    }
}

From source file:com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java

License:Apache License

@Test
public void testPreanalysisSentence() throws Exception {
    // Sentence containing entries handled by the dictionary's
    // pre-analysis rules. try-with-resources fixes a leak: close() was
    // previously skipped when the assertion failed.
    try (Tokenizer tokenizer = createTokenizer(new StringReader("? ? ?."),
            TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH)) {
        assertEquals("?:N:NNG:null:1:1:0:2,:N:NR:null:1:1:2:3,:N:NNG:null:1:1:3:4,"
                + "?:EOJEOL:NNG+JX:null:1:1:5:10,?:N:NNG:null:0:1:5:9,"
                + ":N:NNG:null:1:1:11:13,?:EOJEOL:NNG+VCP+EF:null:1:1:13:17,"
                + ":N:NNG:null:0:1:13:15,", tokenizerToString(tokenizer));
    }
}

From source file:com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java

License:Apache License

@Test
public void testUnknownSurface() throws Exception {
    // A surface form absent from the dictionary must be tagged UNKNOWN
    // while the rest of the sentence is analyzed normally.
    // try-with-resources fixes a leak: close() was previously skipped
    // when the assertion failed.
    try (Tokenizer tokenizer = createTokenizer(new StringReader("  "),
            TokenGenerator.DEFAULT_COMPOUND_NOUN_MIN_LENGTH)) {
        assertEquals(":UNKNOWN:UNKNOWN:null:1:1:0:2,:EOJEOL:VA+ETM:null:1:1:3:5,"
                + ":N:NNG:null:1:1:6:8,", tokenizerToString(tokenizer));
    }
}