Example usage for org.apache.lucene.analysis Tokenizer reset

List of usage examples for org.apache.lucene.analysis Tokenizer reset

Introduction

In this page you can find the example usage for org.apache.lucene.analysis Tokenizer reset.

Prototype

@Override
    public void reset() throws IOException 

Source Link

Usage

From source file:com.devb.search.IndicIndexer.java

License:Apache License

@Override
public void makeIndex() {
    final String indexPath = servletContext.getRealPath("/") + "/hindex/";
    final String docsPath = servletContext.getRealPath("/") + "/hdocs/";
    // Always rebuild the index from scratch (drives OpenMode.CREATE below).
    final boolean create = true;

    final File docDir = new File(docsPath);
    if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath()
                + "' does not exist or is not readable, please check the path\n");
        return;
    }

    final Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...\n");

        final org.apache.lucene.store.Directory dir = FSDirectory.open(new File(indexPath));
        final Analyzer analyzer = new HindiAnalyzer();
        // NOTE(review): a null Version is passed here; confirm this is intended
        // for the Lucene version in use.
        final IndexWriterConfig iwc = new IndexWriterConfig(null, analyzer);
        iwc.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);

        // Ensure the writer is closed even if indexing a file fails.
        final IndexWriter writer = new IndexWriter(dir, iwc);
        try {
            if (docDir.canRead() && docDir.isDirectory()) {
                final String[] files = docDir.list();
                if (files != null) {
                    for (final String name : files) {
                        indexSingleFile(writer, new File(docDir, name));
                    }
                }
            }
        } finally {
            writer.close();
        }

        final Date end = new Date();
        System.out.println((end.getTime() - start.getTime()) + " total milliseconds\n");

    } catch (IOException e) {
        System.out.println("Caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

/**
 * Tokenizes one UTF-8 text file and adds a document per token to the index.
 * Per-token failures are logged and the rest of the file is skipped, matching
 * the original best-effort behavior.
 */
private void indexSingleFile(final IndexWriter writer, final File file) throws IOException {
    final FileInputStream fileInputStream = new FileInputStream(file);
    try {
        final BufferedReader reader = new BufferedReader(
                new InputStreamReader(fileInputStream, "UTF-8"));
        final Tokenizer tokenizer = new StandardTokenizer(reader);
        final CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        // Counts tokens, not physical lines, despite the "linenumber" field name.
        int tokenNumber = 0;
        try {
            while (tokenizer.incrementToken()) {
                final Document doc = new Document();
                doc.add(new StringField("path", file.getName(), Field.Store.YES));
                // Was `new Integer(++lineNumber).toString()`: the boxing
                // constructor is deprecated; Integer.toString avoids it.
                doc.add(new TextField("linenumber", Integer.toString(++tokenNumber), Store.YES));
                doc.add(new TextField("contents", termAtt.toString(), Store.YES));
                writer.addDocument(doc);
            }
            System.out.println("Adding " + file + "\n");
        } catch (Exception e) {
            // Original behavior: log and continue with the next file.
            e.printStackTrace();
        } finally {
            tokenizer.close();
            reader.close();
        }
    } finally {
        fileInputStream.close();
    }
}

From source file:com.github.bibreen.mecab_ko_lucene_analyzer.MeCabKoStandardTokenizerTest.java

License:Apache License

@Test
public void testShortSentence() throws Exception {
    // Tokenize a short sentence (the original Korean text was mangled to '?'
    // during extraction) and compare the full attribute dump produced by
    // tokenizerToString against the expected string.
    Tokenizer tokenizer = createTokenizer(new StringReader(" ? ?"), 2);
    assertEquals(
            ":N:NNG:null:1:1:0:1,:N:NNG:null:1:1:1:3,"
                    + ":N:NNG:null:1:1:4:5,?:COMPOUND:Compound:null:0:2:4:7,"
                    + "?:N:NNG:null:1:1:5:7,?:N:NNG:null:1:1:8:12,",
            tokenizerToString(tokenizer));

    // Reuse the same tokenizer instance for a second sentence.
    // NOTE(review): reset() is called before setReader(); Lucene's documented
    // reuse order is end()/close()/setReader()/reset() — confirm this order is
    // valid for the Lucene version this test targets.
    tokenizer.reset();
    tokenizer.setReader(new StringReader(" ?? ."));
    assertEquals(":N:NNG:null:1:1:0:2,?:N:NNG:null:1:1:3:5,"
            + "?:COMPOUND:Compound:null:0:2:3:6,:N:NNG:null:1:1:5:6,"
            + "?:EOJEOL:NNG+JKS:null:1:1:6:8,:N:NNG:null:0:1:6:7,"
            + ":EOJEOL:VV+EP+EF:null:1:1:9:14,", tokenizerToString(tokenizer));
    tokenizer.close();
}

From source file:com.sindicetech.siren.analysis.filter.TestMailtoFilter.java

License:Open Source License

/**
 * Feeds {@code uri} through the tokenizer wrapped in a {@link MailtoFilter}
 * and checks each emitted token's term, type, and position increment.
 * A null {@code expectedTypes} means every token must carry {@code uritype};
 * a null {@code expectedPosIncr} skips the increment checks.
 */
private void assertURLDecodedTo(final Tokenizer t, final String uri, final String[] expectedStems,
        final String[] expectedTypes, final int[] expectedPosIncr) throws IOException {
    assertTrue("has CharTermAttribute", t.hasAttribute(CharTermAttribute.class));
    assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
    assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));

    final CharTermAttribute term = t.getAttribute(CharTermAttribute.class);
    final TypeAttribute type = t.getAttribute(TypeAttribute.class);
    final PositionIncrementAttribute posIncr = t.getAttribute(PositionIncrementAttribute.class);

    t.setReader(new StringReader(uri));
    t.reset();

    final TokenFilter filter = new MailtoFilter(t);
    int i = 0;
    for (final String stem : expectedStems) {
        assertTrue("token " + i + " exists", filter.incrementToken());
        assertEquals(stem, term.toString());
        assertEquals(expectedTypes == null ? uritype : expectedTypes[i], type.type());
        if (expectedPosIncr != null) {
            assertEquals(expectedPosIncr[i], posIncr.getPositionIncrement());
        }
        i++;
    }
    filter.end();
    filter.close();
}

From source file:com.sindicetech.siren.analysis.filter.TestURIEncodingFilter.java

License:Open Source License

/**
 * Feeds {@code uri} through the tokenizer wrapped in a
 * {@link URIDecodingFilter} (using {@code encoding}) and checks each emitted
 * token's term, type, and position increment. A null {@code expectedTypes}
 * means every token must carry {@code uritype}; a null {@code expectedPosIncr}
 * skips the increment checks.
 */
private void assertURLDecodedTo(final Tokenizer t, final String encoding, final String uri,
        final String[] expectedStems, final String[] expectedTypes, final int[] expectedPosIncr)
        throws IOException {
    assertTrue("has CharTermAttribute", t.hasAttribute(CharTermAttribute.class));
    assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
    assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));

    final CharTermAttribute term = t.getAttribute(CharTermAttribute.class);
    final TypeAttribute type = t.getAttribute(TypeAttribute.class);
    final PositionIncrementAttribute posIncr = t.getAttribute(PositionIncrementAttribute.class);

    t.setReader(new StringReader(uri));
    t.reset();

    final URIDecodingFilter filter = new URIDecodingFilter(t, encoding);
    int i = 0;
    for (final String stem : expectedStems) {
        assertTrue("token " + i + " exists", filter.incrementToken());
        assertEquals(stem, term.toString());
        assertEquals(expectedTypes == null ? uritype : expectedTypes[i], type.type());
        if (expectedPosIncr != null) {
            assertEquals(expectedPosIncr[i], posIncr.getPositionIncrement());
        }
        i++;
    }
    filter.end();
    filter.close();
}

From source file:com.sindicetech.siren.analysis.filter.TestURILocalnameFilter.java

License:Open Source License

/**
 * Runs {@code input} through the tokenizer wrapped in a
 * {@link URILocalnameFilter} (capped at {@code MAX_LENGTH}) and verifies the
 * emitted terms, plus types and position increments when their expectation
 * arrays are non-null. Also asserts the stream is exhausted afterwards.
 */
public void assertNormalisesTo(final Tokenizer t, final String input, final String[] expectedImages,
        final String[] expectedTypes, final int[] expectedPosIncrs) throws Exception {

    assertTrue("has TermAttribute", t.hasAttribute(CharTermAttribute.class));
    final CharTermAttribute term = t.getAttribute(CharTermAttribute.class);

    // Optional attributes are only fetched when they will be checked; fetching
    // an absent attribute would fail.
    TypeAttribute type = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        type = t.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncr = null;
    if (expectedPosIncrs != null) {
        assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
        posIncr = t.getAttribute(PositionIncrementAttribute.class);
    }

    t.setReader(new StringReader(input));
    t.reset();

    final URILocalnameFilter filter = new URILocalnameFilter(t);
    filter.setMaxLength(MAX_LENGTH);

    int i = 0;
    for (final String image : expectedImages) {
        assertTrue("token " + i + " exists", filter.incrementToken());
        assertEquals(image, term.toString());
        if (type != null) {
            assertEquals(expectedTypes[i], type.type());
        }
        if (posIncr != null) {
            assertEquals(expectedPosIncrs[i], posIncr.getPositionIncrement());
        }
        i++;
    }

    assertFalse("end of stream", filter.incrementToken());
    filter.end();
    filter.close();
}

From source file:com.sindicetech.siren.analysis.filter.TestURINormalisationFilter.java

License:Open Source License

/**
 * Runs {@code input} through the tokenizer wrapped in a
 * {@link URINormalisationFilter} and verifies the emitted terms, plus types
 * when {@code expectedTypes} is non-null. Also asserts the stream is
 * exhausted afterwards.
 */
public void assertNormalisesTo(final Tokenizer t, final String input, final String[] expectedImages,
        final String[] expectedTypes) throws Exception {

    assertTrue("has CharTermAttribute", t.hasAttribute(CharTermAttribute.class));
    final CharTermAttribute term = t.getAttribute(CharTermAttribute.class);

    // The type attribute is only fetched when it will be checked.
    TypeAttribute type = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        type = t.getAttribute(TypeAttribute.class);
    }

    t.setReader(new StringReader(input));
    t.reset();

    final TokenStream filter = new URINormalisationFilter(t);

    int i = 0;
    for (final String image : expectedImages) {
        assertTrue("token " + i + " exists", filter.incrementToken());
        assertEquals(image, term.toString());
        if (type != null) {
            assertEquals(expectedTypes[i], type.type());
        }
        i++;
    }

    assertFalse("end of stream", filter.incrementToken());
    filter.end();
    filter.close();
}

From source file:com.sindicetech.siren.analysis.NodeTokenizerTestCase.java

License:Open Source License

/**
 * Runs the tokenizer over {@code input} and drains every token without
 * inspecting any of them. Used to validate the exceptions thrown by the
 * tokenizer.
 */
protected void assertTokenizesTo(final Tokenizer t, final String input) throws Exception {
    t.setReader(new StringReader(input));
    t.reset(); // a new reader requires a reset before consuming tokens

    // Consume the whole stream; only completion (or an exception) matters here.
    for (boolean more = t.incrementToken(); more; more = t.incrementToken()) {
        // intentionally empty
    }

    t.end();
    t.close();
}

From source file:com.sindicetech.siren.analysis.NodeTokenizerTestCase.java

License:Open Source License

/**
 * Runs the tokenizer over {@code input} and verifies, token by token, every
 * attribute for which an expectation array was supplied. The term attribute
 * is always checked; fields, types, datatypes, position increments, and nodes
 * are each optional (pass null to skip). Finally asserts the stream is
 * exhausted.
 */
protected void assertTokenizesTo(final Tokenizer t, final String input, final String[] expectedImages,
        final String[] expectedFields, final String[] expectedTypes, final String[] expectedDatatypes,
        final int[] expectedPosIncrs, final IntsRef[] expectedNode) throws Exception {

    assertTrue("has TermAttribute", t.hasAttribute(CharTermAttribute.class));
    final CharTermAttribute term = t.getAttribute(CharTermAttribute.class);

    // Optional attributes are only fetched when their expectation array is
    // present; fetching an absent attribute would fail.
    PathAttribute field = null;
    if (expectedFields != null) {
        assertTrue("has FieldAttribute", t.hasAttribute(PathAttribute.class));
        field = t.getAttribute(PathAttribute.class);
    }

    TypeAttribute type = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        type = t.getAttribute(TypeAttribute.class);
    }

    DatatypeAttribute datatype = null;
    if (expectedDatatypes != null) {
        assertTrue("has DatatypeAttribute", t.hasAttribute(DatatypeAttribute.class));
        datatype = t.getAttribute(DatatypeAttribute.class);
    }

    PositionIncrementAttribute posIncr = null;
    if (expectedPosIncrs != null) {
        assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
        posIncr = t.getAttribute(PositionIncrementAttribute.class);
    }

    NodeAttribute node = null;
    if (expectedNode != null) {
        assertTrue("has NodeAttribute", t.hasAttribute(NodeAttribute.class));
        node = t.getAttribute(NodeAttribute.class);
    }

    t.setReader(new StringReader(input));
    t.reset(); // reset the stream for the new reader

    for (int i = 0; i < expectedImages.length; i++) {
        assertTrue("token " + i + " exists", t.incrementToken());
        assertEquals("i=" + i, expectedImages[i], term.toString());
        if (field != null) {
            assertEquals("i=" + i, expectedFields[i], field.field());
        }
        if (type != null) {
            assertEquals("i=" + i, expectedTypes[i], type.type());
        }
        if (datatype != null) {
            // A missing datatype URI is reported as the empty string.
            assertEquals("i=" + i, expectedDatatypes[i],
                    datatype.datatypeURI() == null ? "" : String.valueOf(datatype.datatypeURI()));
        }
        if (posIncr != null) {
            assertEquals("i=" + i, expectedPosIncrs[i], posIncr.getPositionIncrement());
        }
        if (node != null) {
            assertEquals("i=" + i, expectedNode[i], node.node());
        }
    }

    assertFalse("end of stream", t.incrementToken());
    t.end();
    t.close();
}

From source file:com.sindicetech.siren.analysis.NodeTokenizerTestCase.java

License:Open Source License

/**
 * Debug helper: tokenizes {@code input} and prints one line per token with
 * field, term, type, datatype URI (empty when absent), position increment,
 * and node, separated by ", ".
 */
protected void print(final Tokenizer t, final String input) throws Exception {
    final CharTermAttribute term = t.getAttribute(CharTermAttribute.class);
    final TypeAttribute type = t.getAttribute(TypeAttribute.class);
    final DatatypeAttribute datatype = t.getAttribute(DatatypeAttribute.class);
    final PositionIncrementAttribute posIncr = t.getAttribute(PositionIncrementAttribute.class);
    final NodeAttribute node = t.getAttribute(NodeAttribute.class);
    final PathAttribute field = t.getAttribute(PathAttribute.class);

    t.setReader(new StringReader(input));
    t.reset(); // reset the stream for the new reader

    while (t.incrementToken()) {
        final StringBuilder line = new StringBuilder()
                .append(field.field()).append(", ")
                .append(term.toString()).append(", ")
                .append(type.type()).append(", ")
                .append(datatype.datatypeURI() == null ? "" : String.valueOf(datatype.datatypeURI()))
                .append(", ")
                .append(posIncr.getPositionIncrement()).append(", ")
                .append(node.node());
        System.out.println(line.toString());
    }

    t.end();
    t.close();
}

From source file:fi.nationallibrary.ndl.solrvoikko2.TestApp.java

License:Open Source License

/**
 * Interactive console demo: reads a word or phrase from stdin, prints the raw
 * Voikko analyses, then feeds the text through a StandardTokenizer +
 * VoikkoFilter chain and prints the filter output.
 */
public static void main(String[] args) throws IOException {
    BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in));
    Voikko voikko = null;
    try {
        ConcurrentMap<String, List<CompoundToken>> cache = new ConcurrentLinkedHashMap.Builder<String, List<CompoundToken>>()
                .maximumWeightedCapacity(100).build();

        // One Voikko instance shared by the direct analysis and the filter.
        // The original created a second instance here and leaked the first
        // (only the last assignment was terminated in the finally block).
        voikko = new Voikko("fi-x-morphoid");

        StringReader reader = new StringReader("");
        Tokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
        tokenizer.setReader(reader);
        tokenizer.reset();

        VoikkoFilter voikkoFilter = new VoikkoFilter(tokenizer, voikko, true,
                VoikkoFilter.DEFAULT_MIN_WORD_SIZE, VoikkoFilter.DEFAULT_MIN_SUBWORD_SIZE,
                VoikkoFilter.DEFAULT_MAX_SUBWORD_SIZE, true, cache, 0);

        String text;
        System.out.println();
        System.out.println("Enter word or phrase");
        while ((text = stdin.readLine()) != null) {
            List<Analysis> analysisList = voikko.analyze(text);
            if (analysisList.isEmpty()) {
                System.out.println("No analysis available");
            }
            for (Analysis analysis : analysisList) {
                System.out.println("Analysis:");
                if (analysis.containsKey(BASEFORM)) {
                    WordComponent component = new WordComponent();
                    component.component = analysis.get(BASEFORM);
                    component.startInOriginal = 0;
                    component.lengthInOriginal = text.length();
                    print(component);
                }
                if (analysis.containsKey(WORDBASES)) {
                    System.out.println(analysis.get(WORDBASES));
                }
            }

            // Rewind the tokenizer onto the new input.
            // NOTE(review): only the tokenizer is reset here, not voikkoFilter;
            // confirm VoikkoFilter carries no per-stream state across inputs.
            tokenizer.close();
            reader = new StringReader(text);
            tokenizer.setReader(reader);
            tokenizer.reset();

            System.out.println("\nVoikkoFilter results:");
            while (voikkoFilter.incrementToken()) {
                System.out.println(
                        voikkoFilter.termAtt.toString() + " [" + voikkoFilter.posIncAtt.getPositionIncrement()
                                + ":" + voikkoFilter.offsetAtt.startOffset() + ":"
                                + voikkoFilter.offsetAtt.endOffset() + "]");
            }

            System.out.println();
            System.out.println("Enter word or phrase");
        }
        voikkoFilter.close();
    } finally {
        // Guard against an NPE when the Voikko constructor itself failed.
        if (voikko != null) {
            voikko.terminate();
        }
    }
}