// NOTE(review): stray non-Java text ("Java tutorial") preceded the license header; commented out so the file compiles.
/** * * APDPlat - Application Product Development Platform * Copyright (c) 2013, ??, yang-shangchuan@qq.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package org.apdplat.word.lucene; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.SimpleFSDirectory; import org.apdplat.word.util.Utils; import org.apdplat.word.util.WordConfTools; import org.junit.Test; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; /** * * @author ?? 
*/ public class ChineseWordAnalyzerTest { @Test public void test1() { try { Analyzer analyzer = new ChineseWordAnalyzer(); TokenStream tokenStream = analyzer.tokenStream("text", "??APDPlat???"); List<String> words = new ArrayList<>(); tokenStream.reset(); while (tokenStream.incrementToken()) { CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class); words.add(charTermAttribute.toString()); } tokenStream.close(); String expResult = "[??, , apdplat, , ?, ?, ?, , ]"; if ("bigram".equals(WordConfTools.get("ngram", "bigram"))) { expResult = "[??, , apdplat, , , ?, ?, ?, , ]"; } assertEquals(expResult, words.toString()); } catch (IOException e) { fail("?" + e.getMessage()); } } @Test public void test2() { try { Analyzer analyzer = new ChineseWordAnalyzer(); TokenStream tokenStream = analyzer.tokenStream("text", "??"); List<String> words = new ArrayList<>(); tokenStream.reset(); while (tokenStream.incrementToken()) { CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class); words.add(charTermAttribute.toString()); } tokenStream.close(); String expResult = "[??, , , , , , ]"; assertEquals(expResult, words.toString()); } catch (IOException e) { fail("?" 
+ e.getMessage()); } } @Test public void test3() { Analyzer analyzer = new ChineseWordAnalyzer(); List<String> sentences = new ArrayList<>(); sentences.add("??APDPlat???"); sentences.add("?"); sentences.add("??"); sentences.add("?"); sentences.add("??"); sentences.add("????"); sentences.add("?"); sentences.add(""); sentences.add("????"); sentences.add("?"); sentences.add("?"); sentences.add("??"); sentences.add(""); sentences.add("?,?"); sentences.add("???"); sentences.add(""); sentences.add("?"); sentences.add("????"); sentences.add(""); sentences.add("????"); sentences.add("??"); sentences.add("?"); sentences.add("?"); sentences.add("?"); sentences.add("????"); sentences.add("?"); sentences.add("?"); sentences.add("?"); sentences.add("?"); sentences.add("?"); sentences.add("???"); sentences.add("?"); sentences.add("?"); sentences.add(""); sentences.add("?"); sentences.add("??"); sentences.add("?"); sentences.add(""); sentences.add(""); sentences.add("?"); sentences.add("????"); sentences.add("word????????ysc"); IndexWriterConfig config = new IndexWriterConfig(analyzer); config.setUseCompoundFile(false); File index = new File("target/indexes"); Utils.deleteDir(index); try (Directory directory = new SimpleFSDirectory(index.toPath()); IndexWriter indexWriter = new IndexWriter(directory, config)) { for (String sentence : sentences) { Document doc = new Document(); Field field = new TextField("text", sentence, Field.Store.YES); doc.add(field); indexWriter.addDocument(doc); } indexWriter.commit(); } catch (Exception e) { e.printStackTrace(); fail("" + e.getMessage()); } try (Directory directory = new SimpleFSDirectory(index.toPath()); DirectoryReader directoryReader = DirectoryReader.open(directory)) { IndexSearcher indexSearcher = new IndexSearcher(directoryReader); QueryParser queryParser = new QueryParser("text", analyzer); Query query = queryParser.parse("text:??"); TopDocs docs = indexSearcher.search(query, Integer.MAX_VALUE); assertEquals(2, docs.totalHits); 
assertEquals("word????????ysc", indexSearcher.doc(docs.scoreDocs[0].doc).get("text")); assertEquals("??APDPlat???", indexSearcher.doc(docs.scoreDocs[1].doc).get("text")); query = queryParser.parse("text:"); docs = indexSearcher.search(query, Integer.MAX_VALUE); assertEquals(1, docs.totalHits); assertEquals("?", indexSearcher.doc(docs.scoreDocs[0].doc).get("text")); } catch (Exception e) { fail("?" + e.getMessage()); } } }