Example usage for org.apache.lucene.analysis.th ThaiAnalyzer ThaiAnalyzer

List of usage examples for org.apache.lucene.analysis.th ThaiAnalyzer ThaiAnalyzer

Introduction

In this page you can find the example usage for org.apache.lucene.analysis.th ThaiAnalyzer ThaiAnalyzer.

Prototype

public ThaiAnalyzer(CharArraySet stopwords) 

Source Link

Document

Builds an analyzer with the given stop words.

Usage

From source file:com.bigdata.search.DefaultAnalyzerFactory.java

License:Open Source License

/**
 * Initializes the various kinds of analyzers that we know about.
 * <p>/*from www .j  a  v  a2  s.c  o m*/
 * Note: Each {@link Analyzer} is registered under both the 3 letter and the
 * 2 letter language codes. See <a
 * href="http://www.loc.gov/standards/iso639-2/php/code_list.php">ISO 639-2</a>.
 * 
 * @todo get some informed advice on which {@link Analyzer}s map onto which
 *       language codes.
 * 
 * @todo thread safety? Analyzers produce token processors so maybe there is
 *       no problem here once things are initialized. If so, maybe this
 *       could be static.
 * 
 * @todo configuration. Could be configured by a file containing a class
 *       name and a list of codes that are handled by that class.
 * 
 * @todo strip language code down to 2/3 characters during lookup.
 * 
 * @todo There are a lot of pidgins based on french, english, and other
 *       languages that are not being assigned here.
 */
synchronized private Map<String, AnalyzerConstructor> getAnalyzers() {

    if (analyzers != null) {

        return analyzers;

    }

    analyzers = new HashMap<String, AnalyzerConstructor>();

    final Set<?> emptyStopwords = Collections.EMPTY_SET;

    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new BrazilianAnalyzer(Version.LUCENE_CURRENT)
                        : new BrazilianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("por", a);
        analyzers.put("pt", a);
    }

    /*
     * Claims to handle Chinese. Does single character extraction. Claims to
     * produce smaller indices as a result.
     * 
     * Note: you can not tokenize with the Chinese analyzer and the do
     * search using the CJK analyzer and visa versa.
     * 
     * Note: I have no idea whether this would work for Japanese and Korean
     * as well. I expect so, but no real clue.
     */
    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return new ChineseAnalyzer();
            }
        };
        analyzers.put("zho", a);
        analyzers.put("chi", a);
        analyzers.put("zh", a);
    }

    /*
     * Claims to handle Chinese, Japanese, Korean. Does double character
     * extraction with overlap.
     */
    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new CJKAnalyzer(Version.LUCENE_CURRENT)
                        : new CJKAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        //            analyzers.put("zho", a);
        //            analyzers.put("chi", a);
        //            analyzers.put("zh", a);
        analyzers.put("jpn", a);
        analyzers.put("ja", a);
        analyzers.put("jpn", a);
        analyzers.put("kor", a);
        analyzers.put("ko", a);
    }

    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new CzechAnalyzer(Version.LUCENE_CURRENT)
                        : new CzechAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("ces", a);
        analyzers.put("cze", a);
        analyzers.put("cs", a);
    }

    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new DutchAnalyzer(Version.LUCENE_CURRENT)
                        : new DutchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("dut", a);
        analyzers.put("nld", a);
        analyzers.put("nl", a);
    }

    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new FrenchAnalyzer(Version.LUCENE_CURRENT)
                        : new FrenchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("fra", a);
        analyzers.put("fre", a);
        analyzers.put("fr", a);
    }

    /*
     * Note: There are a lot of language codes for German variants that
     * might be useful here.
     */
    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new GermanAnalyzer(Version.LUCENE_CURRENT)
                        : new GermanAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("deu", a);
        analyzers.put("ger", a);
        analyzers.put("de", a);
    }

    // Note: ancient greek has a different code (grc).
    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new GreekAnalyzer(Version.LUCENE_CURRENT)
                        : new GreekAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("gre", a);
        analyzers.put("ell", a);
        analyzers.put("el", a);
    }

    // @todo what about other Cyrillic scripts?
    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new RussianAnalyzer(Version.LUCENE_CURRENT)
                        : new RussianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("rus", a);
        analyzers.put("ru", a);
    }

    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return new ThaiAnalyzer(Version.LUCENE_CURRENT);
            }
        };
        analyzers.put("tha", a);
        analyzers.put("th", a);
    }

    // English
    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new StandardAnalyzer(Version.LUCENE_CURRENT)
                        : new StandardAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("eng", a);
        analyzers.put("en", a);
        /*
         * Note: There MUST be an entry under the empty string (""). This
         * entry will be requested when there is no entry for the specified
         * language code.
         */
        analyzers.put("", a);
    }

    return analyzers;

}

From source file:com.bizosys.hsearch.inpipe.TokenizeNonEnglish.java

License:Apache License

/**
 * Registers one language-specific Lucene analyzer per ISO 639-1 code
 * for the non-English tokenization pipeline.
 */
public void init(Configuration conf) throws ApplicationFault, SystemFault {
    // Read the configured Lucene version once and reuse it for every analyzer.
    final Version v = LuceneConstants.version;
    languageMap.put("br", new BrazilianAnalyzer(v));
    languageMap.put("cz", new CzechAnalyzer(v));
    languageMap.put("nl", new DutchAnalyzer(v));
    languageMap.put("fr", new FrenchAnalyzer(v));
    languageMap.put("de", new GermanAnalyzer(v));
    languageMap.put("el", new GreekAnalyzer(v));
    languageMap.put("ru", new RussianAnalyzer(v));
    languageMap.put("th", new ThaiAnalyzer(v));
}

From source file:com.zimbra.extension.analyzer.th.ThaiAnalyzerExtension.java

License:Open Source License

@Override
public synchronized void init() {
    // The extension can provide any name, but it must be unique across
    // registered analyzers or registration fails with a ServiceException.
    final String name = getName();
    sLog.info("Initializing " + name);
    try {
        ZimbraAnalyzer.registerAnalyzer(name, new ThaiAnalyzer(Version.LUCENE_35));
    } catch (ServiceException e) {
        sLog.error("Error while registering extension " + name, e);
    }
}

From source file:Example.lucene.HelloLucene.java

public static void main(String[] args) throws IOException, ParseException {
    // 0. A single analyzer shared by indexing and searching, so that query
    //    terms are tokenized exactly like the indexed documents.
    final Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45);

    // 1. Build the index on disk under "indexing".
    final Directory index = FSDirectory.open(new File("indexing"));

    final IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, analyzer);
    try (IndexWriter w = new IndexWriter(index, config)) {
        addDoc(w, "Lucene in Action 123.456", "193398817");
        addDoc(w, "Lucene for Dummies 123 456", "55320055Z");
        addDoc(w, "Managing Gigabytes 123456", "55063554A");
        addDoc(w, "", "9900333X");
        addDoc(w, "?", "9900333X");
    }

    // 2. Build the query; "title" is the default field used when the query
    //    string does not name a field explicitly.
    final String querystr = args.length > 0 ? args[0] : "";
    final Query q = new QueryParser(Version.LUCENE_45, "title", analyzer).parse(querystr);

    // 3. Search the freshly built index.
    final int hitsPerPage = 10;
    try (IndexReader reader = DirectoryReader.open(index)) {
        final IndexSearcher searcher = new IndexSearcher(reader);
        final TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
        searcher.search(q, collector);
        final ScoreDoc[] hits = collector.topDocs().scoreDocs;

        // 4. Print each hit as "<rank>. <isbn>\t<title>" with a 1-based rank.
        System.out.println("Found " + hits.length + " hits.");
        int rank = 1;
        for (ScoreDoc hit : hits) {
            final Document d = searcher.doc(hit.doc);
            System.out.println((rank++) + ". " + d.get("isbn") + "\t" + d.get("title"));
        }
    }
}

From source file:Example.lucene.ReadIndex.java

/**
 * Dumps every term of the "content" field together with its document
 * frequency, one "term docFreq" pair per line.
 */
public static void main(String[] args) throws IOException, ParseException {

    final Directory index = FSDirectory.open(new File("data/indexing"));

    // try-with-resources: the original leaked the IndexReader.
    try (IndexReader reader = DirectoryReader.open(index)) {
        final Fields fields = MultiFields.getFields(reader);
        final Terms terms = fields.terms("content");
        final TermsEnum iterator = terms.iterator(null);
        BytesRef byteRef;
        while ((byteRef = iterator.next()) != null) {
            // Term bytes are UTF-8; decode them explicitly instead of
            // new String(bytes, offset, length), which uses the platform
            // default charset and mangles non-ASCII (e.g. Thai) terms.
            final String term = byteRef.utf8ToString();
            final int docFreq = iterator.docFreq();
            System.out.println(term + " " + docFreq);
        }
    }
}

From source file:Example.lucene.SearchNHilight.java

/**
 * Searches the "content" field and prints, per hit, the url, title and a
 * separator; a two-fragment HTML preview of the match is also assembled.
 */
public static void main(String[] args) throws IOException, ParseException, InvalidTokenOffsetsException {
    // Same analyzer as at indexing time so query terms match indexed terms.
    Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45);

    Directory index = FSDirectory.open(new File("data/indexing"));
    String querystr = args.length > 0 ? args[0] : "golf user";
    // "content" is the default field when the query names no field.
    Query query = new MultiFieldQueryParser(Version.LUCENE_45, new String[] { "content" }, analyzer)
            .parse(querystr);

    // 3. search — honour hitsPerPage (the original passed a literal 10)
    // and close the reader deterministically (the original leaked it).
    int hitsPerPage = 10;
    try (IndexReader reader = DirectoryReader.open(index)) {
        IndexSearcher searcher = new IndexSearcher(reader);

        TopDocs hits = searcher.search(query, hitsPerPage);

        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
        Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
        // Iterate over the hits actually returned; the original's fixed
        // 0..9 loop threw ArrayIndexOutOfBoundsException on < 10 hits.
        for (ScoreDoc sd : hits.scoreDocs) {
            int id = sd.doc;
            Document doc = searcher.doc(id);
            System.out.println(doc.get("url"));
            System.out.println(doc.get("title"));
            String text = doc.get("content");
            TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "content",
                    analyzer);
            TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);
            // Assemble at most two scoring fragments as an HTML preview.
            StringBuilder preview = new StringBuilder();
            int k = 0;
            for (TextFragment frag1 : frag) {
                if ((frag1 != null) && (frag1.getScore() > 0)) {
                    preview.append(frag1.toString()).append("...<br>");
                    k++;
                    // Get 2 Line Preview
                    if (k >= 2)
                        break;
                }
            }
            System.out.println("-------------");
        }
    }
}

From source file:Example.lucene.TestIndexer.java

public static void main(String[] args) throws IOException, ParseException {
    // 0. One Thai analyzer shared by indexing and searching so that queries
    //    are tokenized exactly like the indexed documents.
    final Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45);
    final File inDir = new File("data/test_snipped");

    // 1. Build the index from every archive file in the input directory.
    final Directory index = FSDirectory.open(new File("data/indexingonly"));

    final IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, analyzer);
    try (IndexWriter w = new IndexWriter(index, config)) {
        int id = 1;
        for (File f : inDir.listFiles()) {
            try (ArcReader ar = new ArcReader(f)) {
                System.out.println(f.getName());
                while (ar.Next()) {
                    // Record content appears to be "title\ncontent"; a single
                    // line is indexed with empty content, anything else is
                    // skipped — TODO confirm against the archive format.
                    final String[] s = ar.Record.ArchiveContent.split("\n");
                    switch (s.length) {
                    case 2:
                        addDoc(w, id++, ar.Record.URL, s[0], s[1]);
                        break;
                    case 1:
                        addDoc(w, id++, ar.Record.URL, s[0], "");
                        break;
                    default:
                        break;
                    }
                }
            }
        }
    }

    // 2. Build the query; "title" is the default field when the query
    //    string does not name a field explicitly.
    final String querystr = args.length > 0 ? args[0] : "";
    final Query q = new QueryParser(Version.LUCENE_45, "title", analyzer).parse(querystr);

    // 3. Search the freshly built index.
    final int hitsPerPage = 10;
    try (IndexReader reader = DirectoryReader.open(index)) {
        final IndexSearcher searcher = new IndexSearcher(reader);

        final TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
        searcher.search(q, collector);
        final ScoreDoc[] hits = collector.topDocs().scoreDocs;

        // 4. Print id, url, title and content per hit, 1-based rank first.
        System.out.println("Found " + hits.length + " hits.");
        for (int i = 0; i < hits.length; ++i) {
            final Document d = searcher.doc(hits[i].doc);
            System.out.println((i + 1) + ". " + d.get("id") + "\t" + d.get("url") + "\t" + d.get("title") + "\t"
                    + d.get("content"));
        }
    }
}

From source file:Example.lucene.TestSearch.java

public static void main(String[] args) throws ParseException, IOException {

    // Same analyzer as at indexing time so query terms match indexed terms.
    final Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45);

    final Directory index = FSDirectory.open(new File("data/indexingonly"));

    // 2. Build the query; "content" is the default search field when the
    //    query string does not name one explicitly.
    final String querystr = args.length > 0 ? args[0] : "golf user";
    final Query q = new MultiFieldQueryParser(Version.LUCENE_45, new String[] { "content" }, analyzer)
            .parse(querystr);

    // 3. Search.
    final int hitsPerPage = 10;
    try (IndexReader reader = DirectoryReader.open(index)) {
        final IndexSearcher searcher = new IndexSearcher(reader);

        final TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
        searcher.search(q, collector);

        // NOTE(review): topDocs(5) treats 5 as a *start offset*, i.e. it
        // skips the first five results rather than returning the top five —
        // confirm this pagination is intended.
        final TopDocs td = collector.topDocs(5);
        final ScoreDoc[] hits = td.scoreDocs;

        // 4. Print id, url, title and content per hit, 1-based rank first.
        System.out.println("Found " + hits.length + " hits. from " + td.totalHits + " docs.");
        for (int i = 0; i < hits.length; ++i) {
            final Document d = searcher.doc(hits[i].doc);
            System.out.println((i + 1) + ". " + d.get("id") + "\t" + d.get("url") + "\t" + d.get("title") + "\t"
                    + d.get("content"));

        }
    }
}

From source file:Main.WebAPI.Search.java

/**
 * Runs a highlighted search over the "content" field of the on-disk index,
 * printing url, title and a separator per hit.
 *
 * @param args args[0] is a query
 * 
 * @throws IOException
 * @throws ParseException
 * @throws InvalidTokenOffsetsException 
 */

public static void main(String[] args) throws IOException, ParseException, InvalidTokenOffsetsException {
    // Same analyzer as at indexing time so query terms match indexed terms.
    Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45);

    Directory index = FSDirectory.open(new File("data/indexing"));
    String querystr = args.length > 0 ? args[0] : "mike lab";
    // "content" is the default field when the query names no field.
    Query query = new MultiFieldQueryParser(Version.LUCENE_45, new String[] { "content" }, analyzer)
            .parse(querystr);

    // 3. search — honour hitsPerPage (the original passed a literal 10)
    // and close the reader deterministically (the original leaked it).
    int hitsPerPage = 10;
    try (IndexReader reader = DirectoryReader.open(index)) {
        IndexSearcher searcher = new IndexSearcher(reader);

        TopDocs hits = searcher.search(query, hitsPerPage);

        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();
        Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
        // Iterate over the hits actually returned; the original's fixed
        // 0..9 loop threw ArrayIndexOutOfBoundsException on < 10 hits.
        for (ScoreDoc sd : hits.scoreDocs) {
            int id = sd.doc;
            Document doc = searcher.doc(id);
            System.out.println(doc.get("url"));
            System.out.println(doc.get("title"));
            String text = doc.get("content");
            TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "content",
                    analyzer);
            TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);
            // Assemble at most two scoring fragments as an HTML preview.
            StringBuilder preview = new StringBuilder();
            int k = 0;
            for (TextFragment frag1 : frag) {
                if ((frag1 != null) && (frag1.getScore() > 0)) {
                    preview.append(frag1.toString()).append("...<br>");
                    k++;
                    // Get 2 Line Preview
                    if (k >= 2)
                        break;
                }
            }
            System.out.println("-------------");
        }
    }
}

From source file:org.apache.jackrabbit.core.query.lucene.LanguageCustomizingAnalyzerRegistry.java

License:Open Source License

public LanguageCustomizingAnalyzerRegistry(IndexingConfiguration configuration) {
    this.configuration = configuration;

    languageToAnalyzer.put("ar", new AnalyzerWrapper(new ArabicAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("br", new AnalyzerWrapper(new BrazilianAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("cjk", new AnalyzerWrapper(new CJKAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("cn", new AnalyzerWrapper(new ChineseAnalyzer(), true));
    languageToAnalyzer.put("cz", new AnalyzerWrapper(new CzechAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("de", new AnalyzerWrapper(new GermanAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("el", new AnalyzerWrapper(new GreekAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("en", new AnalyzerWrapper(
            new SnowballAnalyzer(Version.LUCENE_30, "English", StopAnalyzer.ENGLISH_STOP_WORDS_SET), true));
    languageToAnalyzer.put("fa", new AnalyzerWrapper(new PersianAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("fr", new AnalyzerWrapper(new FrenchAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("nl", new AnalyzerWrapper(new DutchAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("ru", new AnalyzerWrapper(new RussianAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("th", new AnalyzerWrapper(new ThaiAnalyzer(Version.LUCENE_30), true));
}