List of usage examples for org.apache.lucene.analysis.th ThaiAnalyzer ThaiAnalyzer
public ThaiAnalyzer(CharArraySet stopwords)
From source file:com.bigdata.search.DefaultAnalyzerFactory.java
License:Open Source License
/** * Initializes the various kinds of analyzers that we know about. * <p>/*from www .j a v a2 s.c o m*/ * Note: Each {@link Analyzer} is registered under both the 3 letter and the * 2 letter language codes. See <a * href="http://www.loc.gov/standards/iso639-2/php/code_list.php">ISO 639-2</a>. * * @todo get some informed advice on which {@link Analyzer}s map onto which * language codes. * * @todo thread safety? Analyzers produce token processors so maybe there is * no problem here once things are initialized. If so, maybe this * could be static. * * @todo configuration. Could be configured by a file containing a class * name and a list of codes that are handled by that class. * * @todo strip language code down to 2/3 characters during lookup. * * @todo There are a lot of pidgins based on french, english, and other * languages that are not being assigned here. */ synchronized private Map<String, AnalyzerConstructor> getAnalyzers() { if (analyzers != null) { return analyzers; } analyzers = new HashMap<String, AnalyzerConstructor>(); final Set<?> emptyStopwords = Collections.EMPTY_SET; { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new BrazilianAnalyzer(Version.LUCENE_CURRENT) : new BrazilianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("por", a); analyzers.put("pt", a); } /* * Claims to handle Chinese. Does single character extraction. Claims to * produce smaller indices as a result. * * Note: you can not tokenize with the Chinese analyzer and the do * search using the CJK analyzer and visa versa. * * Note: I have no idea whether this would work for Japanese and Korean * as well. I expect so, but no real clue. 
*/ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return new ChineseAnalyzer(); } }; analyzers.put("zho", a); analyzers.put("chi", a); analyzers.put("zh", a); } /* * Claims to handle Chinese, Japanese, Korean. Does double character * extraction with overlap. */ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new CJKAnalyzer(Version.LUCENE_CURRENT) : new CJKAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; // analyzers.put("zho", a); // analyzers.put("chi", a); // analyzers.put("zh", a); analyzers.put("jpn", a); analyzers.put("ja", a); analyzers.put("jpn", a); analyzers.put("kor", a); analyzers.put("ko", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new CzechAnalyzer(Version.LUCENE_CURRENT) : new CzechAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("ces", a); analyzers.put("cze", a); analyzers.put("cs", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new DutchAnalyzer(Version.LUCENE_CURRENT) : new DutchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("dut", a); analyzers.put("nld", a); analyzers.put("nl", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new FrenchAnalyzer(Version.LUCENE_CURRENT) : new FrenchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("fra", a); analyzers.put("fre", a); analyzers.put("fr", a); } /* * Note: There are a lot of language codes for German variants that * might be useful here. */ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? 
new GermanAnalyzer(Version.LUCENE_CURRENT) : new GermanAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("deu", a); analyzers.put("ger", a); analyzers.put("de", a); } // Note: ancient greek has a different code (grc). { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new GreekAnalyzer(Version.LUCENE_CURRENT) : new GreekAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("gre", a); analyzers.put("ell", a); analyzers.put("el", a); } // @todo what about other Cyrillic scripts? { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new RussianAnalyzer(Version.LUCENE_CURRENT) : new RussianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("rus", a); analyzers.put("ru", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return new ThaiAnalyzer(Version.LUCENE_CURRENT); } }; analyzers.put("tha", a); analyzers.put("th", a); } // English { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new StandardAnalyzer(Version.LUCENE_CURRENT) : new StandardAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("eng", a); analyzers.put("en", a); /* * Note: There MUST be an entry under the empty string (""). This * entry will be requested when there is no entry for the specified * language code. */ analyzers.put("", a); } return analyzers; }
From source file:com.bizosys.hsearch.inpipe.TokenizeNonEnglish.java
License:Apache License
/**
 * Registers the non-English Lucene analyzers this tokenizer knows about,
 * keyed by 2-letter language code.
 *
 * @param conf the configuration handed to this pipe (not read here)
 * @throws ApplicationFault declared by the pipe contract
 * @throws SystemFault declared by the pipe contract
 */
public void init(Configuration conf) throws ApplicationFault, SystemFault {
    // 2-letter code -> language-specific analyzer, all built against the
    // same Lucene version constant.
    languageMap.put("br", new BrazilianAnalyzer(LuceneConstants.version));
    languageMap.put("cz", new CzechAnalyzer(LuceneConstants.version));
    languageMap.put("nl", new DutchAnalyzer(LuceneConstants.version));
    languageMap.put("fr", new FrenchAnalyzer(LuceneConstants.version));
    languageMap.put("de", new GermanAnalyzer(LuceneConstants.version));
    languageMap.put("el", new GreekAnalyzer(LuceneConstants.version));
    languageMap.put("ru", new RussianAnalyzer(LuceneConstants.version));
    languageMap.put("th", new ThaiAnalyzer(LuceneConstants.version));
}
From source file:com.zimbra.extension.analyzer.th.ThaiAnalyzerExtension.java
License:Open Source License
@Override public synchronized void init() { sLog.info("Initializing " + getName()); try {/*from w ww .j ava2s . c om*/ // The extension can provide any name, however that name must be // unique or else the registration will fail. ZimbraAnalyzer.registerAnalyzer(getName(), new ThaiAnalyzer(Version.LUCENE_35)); } catch (ServiceException e) { sLog.error("Error while registering extension " + getName(), e); } }
From source file:Example.lucene.HelloLucene.java
public static void main(String[] args) throws IOException, ParseException { // 0. Specify the analyzer for tokenizing text. // The same analyzer should be used for indexing and searching Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45); // 1. create the index Directory index = FSDirectory.open(new File("indexing")); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, analyzer); try (IndexWriter w = new IndexWriter(index, config)) { addDoc(w, "Lucene in Action 123.456", "193398817"); addDoc(w, "Lucene for Dummies 123 456", "55320055Z"); addDoc(w, "Managing Gigabytes 123456", "55063554A"); addDoc(w, "", "9900333X"); addDoc(w,/* w w w. j a v a 2 s . c o m*/ "?", "9900333X"); } // 2. query String querystr = args.length > 0 ? args[0] : ""; // the "title" arg specifies the default field to use // when no field is explicitly specified in the query. Query q = new QueryParser(Version.LUCENE_45, "title", analyzer).parse(querystr); // 3. search int hitsPerPage = 10; try (IndexReader reader = DirectoryReader.open(index)) { IndexSearcher searcher = new IndexSearcher(reader); TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true); searcher.search(q, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; // 4. display results System.out.println("Found " + hits.length + " hits."); for (int i = 0; i < hits.length; ++i) { int docId = hits[i].doc; Document d = searcher.doc(docId); System.out.println((i + 1) + ". " + d.get("isbn") + "\t" + d.get("title")); } } }
From source file:Example.lucene.ReadIndex.java
public static void main(String[] args) throws IOException, ParseException { Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45); Directory index = FSDirectory.open(new File("data/indexing")); // 2. query/*from w w w . ja v a 2 s .c o m*/ //String querystr = args.length > 0 ? args[0] : "golf user"; // the "title" arg specifies the default field to use // when no field is explicitly specified in the query. //Query q = new MultiFieldQueryParser(Version.LUCENE_45, new String[] {"content"}, analyzer).parse(querystr); //IndexReader indexReader = IndexReader.open(path); IndexReader reader = DirectoryReader.open(index); //IndexSearcher searcher = new IndexSearcher(reader); //Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms("content"); //TermsEnum te = terms.iterator(TermsEnum.EMPTY); Fields fields = MultiFields.getFields(reader); Terms terms = fields.terms("content"); TermsEnum iterator = terms.iterator(null); BytesRef byteRef; while ((byteRef = iterator.next()) != null) { String term = new String(byteRef.bytes, byteRef.offset, byteRef.length); int docFreq = iterator.docFreq(); System.out.println(term + " " + docFreq); } }
From source file:Example.lucene.SearchNHilight.java
public static void main(String[] args) throws IOException, ParseException, InvalidTokenOffsetsException { //... Above, create documents with two fields, one with term vectors (tv) and one without (notv) Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45); Directory index = FSDirectory.open(new File("data/indexing")); String querystr = args.length > 0 ? args[0] : "golf user"; // the "title" arg specifies the default field to use // when no field is explicitly specified in the query. Query query = new MultiFieldQueryParser(Version.LUCENE_45, new String[] { "content" }, analyzer) .parse(querystr);//from w w w.j a v a 2s.co m // 3. search int hitsPerPage = 10; IndexReader reader = DirectoryReader.open(index); IndexSearcher searcher = new IndexSearcher(reader); TopDocs hits = searcher.search(query, 10); SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(); Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query)); String Preview; for (int i = 0; i < 10; i++) { int id = hits.scoreDocs[i].doc; Document doc = searcher.doc(id); String text; Preview = ""; System.out.println(doc.get("url")); System.out.println(doc.get("title")); text = doc.get("content"); TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "content", analyzer); TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);//highlighter.getBestFragments(tokenStream, text, 3, "..."); int k = 0; for (TextFragment frag1 : frag) { if ((frag1 != null) && (frag1.getScore() > 0)) { Preview += (frag1.toString()) + "...<br>"; k++; // Get 2 Line Preview if (k >= 2) break; } } //Term vector System.out.println("-------------"); } }
From source file:Example.lucene.TestIndexer.java
public static void main(String[] args) throws IOException, ParseException { // 0. Specify the analyzer for tokenizing text. // The same analyzer should be used for indexing and searching Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45); String InDirName = "data/test_snipped"; File InDir = new File(InDirName); // 1. create the index Directory index = FSDirectory.open(new File("data/indexingonly")); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, analyzer); try (IndexWriter w = new IndexWriter(index, config)) { String[] s;/*from w ww .ja va2s . c om*/ int id = 1; for (File f : InDir.listFiles()) { try (ArcReader ar = new ArcReader(f)) { System.out.println(f.getName()); while (ar.Next()) { s = ar.Record.ArchiveContent.split("\n"); switch (s.length) { case 2: addDoc(w, id++, ar.Record.URL, s[0], s[1]); break; case 1: addDoc(w, id++, ar.Record.URL, s[0], ""); break; default: break; } } } } } // 2. query String querystr = args.length > 0 ? args[0] : ""; // the "title" arg specifies the default field to use // when no field is explicitly specified in the query. Query q = new QueryParser(Version.LUCENE_45, "title", analyzer).parse(querystr); // 3. search int hitsPerPage = 10; try (IndexReader reader = DirectoryReader.open(index)) { IndexSearcher searcher = new IndexSearcher(reader); TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true); searcher.search(q, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; // 4. display results System.out.println("Found " + hits.length + " hits."); for (int i = 0; i < hits.length; ++i) { int docId = hits[i].doc; Document d = searcher.doc(docId); System.out.println((i + 1) + ". " + d.get("id") + "\t" + d.get("url") + "\t" + d.get("title") + "\t" + d.get("content")); } } }
From source file:Example.lucene.TestSearch.java
public static void main(String[] args) throws ParseException, IOException { Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45); Directory index = FSDirectory.open(new File("data/indexingonly")); // 2. query/*from ww w . j a v a 2 s . c o m*/ String querystr = args.length > 0 ? args[0] : "golf user"; // the "title" arg specifies the default field to use // when no field is explicitly specified in the query. Query q = new MultiFieldQueryParser(Version.LUCENE_45, new String[] { "content" }, analyzer) .parse(querystr); // 3. search int hitsPerPage = 10; try (IndexReader reader = DirectoryReader.open(index)) { IndexSearcher searcher = new IndexSearcher(reader); TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true); searcher.search(q, collector); TopDocs td = collector.topDocs(5); ScoreDoc[] hits = td.scoreDocs; // 4. display results System.out.println("Found " + hits.length + " hits. from " + td.totalHits + " docs."); for (int i = 0; i < hits.length; ++i) { int docId = hits[i].doc; Document d = searcher.doc(docId); System.out.println((i + 1) + ". " + d.get("id") + "\t" + d.get("url") + "\t" + d.get("title") + "\t" + d.get("content")); } } }
From source file:Main.WebAPI.Search.java
/** * //from w w w . ja v a 2 s . co m * @param args args[0] is a query * * @throws IOException * @throws ParseException * @throws InvalidTokenOffsetsException */ public static void main(String[] args) throws IOException, ParseException, InvalidTokenOffsetsException { //... Above, create documents with two fields, one with term vectors (tv) and one without (notv) Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45); Directory index = FSDirectory.open(new File("data/indexing")); String querystr = args.length > 0 ? args[0] : "mike lab"; // the "title" arg specifies the default field to use // when no field is explicitly specified in the query. Query query = new MultiFieldQueryParser(Version.LUCENE_45, new String[] { "content" }, analyzer) .parse(querystr); // 3. search int hitsPerPage = 10; IndexReader reader = DirectoryReader.open(index); IndexSearcher searcher = new IndexSearcher(reader); TopDocs hits = searcher.search(query, 10); SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(); Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query)); String Preview; for (int i = 0; i < 10; i++) { int id = hits.scoreDocs[i].doc; Document doc = searcher.doc(id); String text; Preview = ""; System.out.println(doc.get("url")); System.out.println(doc.get("title")); text = doc.get("content"); TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), id, "content", analyzer); TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, false, 10);//highlighter.getBestFragments(tokenStream, text, 3, "..."); int k = 0; for (TextFragment frag1 : frag) { if ((frag1 != null) && (frag1.getScore() > 0)) { Preview += (frag1.toString()) + "...<br>"; k++; // Get 2 Line Preview if (k >= 2) break; } } //Term vector System.out.println("-------------"); } }
From source file:org.apache.jackrabbit.core.query.lucene.LanguageCustomizingAnalyzerRegistry.java
License:Open Source License
/**
 * Builds the registry of per-language analyzers, keyed by language code.
 * Every wrapper is constructed with {@code true} as its second argument.
 *
 * @param configuration the indexing configuration to retain
 */
public LanguageCustomizingAnalyzerRegistry(IndexingConfiguration configuration) {
    this.configuration = configuration;
    languageToAnalyzer.put("ar", new AnalyzerWrapper(new ArabicAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("br", new AnalyzerWrapper(new BrazilianAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("cjk", new AnalyzerWrapper(new CJKAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("cn", new AnalyzerWrapper(new ChineseAnalyzer(), true));
    languageToAnalyzer.put("cz", new AnalyzerWrapper(new CzechAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("de", new AnalyzerWrapper(new GermanAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("el", new AnalyzerWrapper(new GreekAnalyzer(Version.LUCENE_30), true));
    // English uses a Snowball stemmer with the standard English stopword set.
    languageToAnalyzer.put("en", new AnalyzerWrapper(
            new SnowballAnalyzer(Version.LUCENE_30, "English", StopAnalyzer.ENGLISH_STOP_WORDS_SET), true));
    languageToAnalyzer.put("fa", new AnalyzerWrapper(new PersianAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("fr", new AnalyzerWrapper(new FrenchAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("nl", new AnalyzerWrapper(new DutchAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("ru", new AnalyzerWrapper(new RussianAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("th", new AnalyzerWrapper(new ThaiAnalyzer(Version.LUCENE_30), true));
}