Java tutorial
/** * Copyright 2009 Welocalize, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * * You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package com.globalsight.ling.tm2.lucene; import java.io.IOException; import java.io.StringReader; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.TextField; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.queryparser.classic.QueryParser.Operator; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.BooleanClause.Occur; import com.globalsight.ling.lucene.GSAttribute; import com.globalsight.ling.lucene.IndexDocument; import com.globalsight.util.GlobalSightLocale; /** * Wrapper of Lucene Document object for Tm segment */ class TuvDocument { // field names public static final String TEXT_FIELD = "text"; public static final String TUV_ID_FIELD = "tuv_id"; public static final String TU_ID_FIELD = "tu_id"; public static final String TM_ID_FIELD = "tm_id"; public static final String IS_SOURCE_FIELD = "is_source"; public static final String TOKEN_COUNT_FIELD = "token_count"; public static final String TARGET_LOCALES_FIELD = "target_locales"; private Document m_document = null; // cache values of each Field in m_document private String m_text = null; private Long m_tuvId = null; private Long m_tuId = null; private Long m_tmId = null; private Boolean m_isSourceLocale = null; // used by TM3, null for TM2 private Set<String> m_targetLocales = null; private Integer m_totalTokenCount = null; public TuvDocument(String text, long tuvId, long tuId, long tmId, boolean isSourceLocale, Set<String> targetLocales, Analyzer analyzer) throws Exception { m_text = text; m_tuvId = new Long(tuvId); m_tuId = new Long(tuId); m_tmId = new Long(tmId); m_isSourceLocale = new Boolean(isSourceLocale); m_targetLocales = targetLocales; m_totalTokenCount = new Integer(getTotalTokenCount(text, analyzer)); m_document = createDocument(); } public TuvDocument(Document document) { m_document = document; } public Document getDocument() { return m_document; } public long getTuvId() throws Exception { if (m_tuvId == null) { String idStr = m_document.get(TUV_ID_FIELD); m_tuvId = new Long(idStr); } return m_tuvId.longValue(); } public Long getTuIdAsLong() throws Exception { if (m_tuId == null) { String idStr = m_document.get(TU_ID_FIELD); m_tuId = new Long(idStr); } return m_tuId; } public long getTuId() throws Exception { return getTuIdAsLong().longValue(); } public long getTmId() throws Exception { if (m_tmId == null) { String idStr = m_document.get(TM_ID_FIELD); m_tmId = new Long(idStr); } return m_tmId.longValue(); } public Long getTmIdAsLong() throws Exception { if (m_tmId == null) { getTmId(); } return m_tmId; } public int getTotalTokenCount() throws Exception { if (m_totalTokenCount == null) { String idStr = m_document.get(TOKEN_COUNT_FIELD); m_totalTokenCount = new Integer(idStr); } return m_totalTokenCount.intValue(); } public boolean isSourceLocale() throws Exception { if (m_isSourceLocale == null) { String idStr = m_document.get(IS_SOURCE_FIELD); m_isSourceLocale = new Boolean(idStr); } return m_isSourceLocale.booleanValue(); } private int getTotalTokenCount(String text, Analyzer analyzer) throws Exception { TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(text)); tokenStream.reset(); int tokenCount = 0; while (tokenStream.incrementToken()) { tokenCount++; } return tokenCount; } private Document createDocument() { Document doc = new Document(); FieldType ft; Field field; // text field. not stored, indexed, tokenized. ft = new FieldType(); ft.setStored(false); ft.setIndexed(true); ft.setTokenized(true); field = new Field(TEXT_FIELD, m_text, ft);// false, true, true); doc.add(field); // Tuv id field. stored, indexed, not tokenized. ft = new FieldType(); ft.setStored(true); ft.setIndexed(true); ft.setTokenized(false); field = new Field(TUV_ID_FIELD, m_tuvId.toString(), ft);// true, true, false); doc.add(field); // Tu id field. stored, not indexed, not tokenized. ft = new FieldType(); ft.setStored(true); ft.setIndexed(false); ft.setTokenized(false); field = new Field(TU_ID_FIELD, m_tuId.toString(), ft);//true, false, false); doc.add(field); // TM id field. stored, not indexed, not tokenized. ft = new FieldType(); ft.setStored(true); ft.setIndexed(false); ft.setTokenized(false); field = new Field(TM_ID_FIELD, m_tmId.toString(), ft);//true, false, false); doc.add(field); // Token count field. stored, not indexed, not tokenized. ft = new FieldType(); ft.setStored(true); ft.setIndexed(false); ft.setTokenized(false); field = new Field(TOKEN_COUNT_FIELD, m_totalTokenCount.toString(), ft);//true, false, false); doc.add(field); // Is source field. stored, not indexed, not tokenized. ft = new FieldType(); ft.setStored(true); ft.setIndexed(false); ft.setTokenized(false); field = new Field(IS_SOURCE_FIELD, m_isSourceLocale.toString(), ft);// true, false, false); doc.add(field); // target locales field. not stored, indexed, tokenized. if (m_targetLocales != null) { StringBuilder locs = new StringBuilder(); for (String locale : m_targetLocales) { locs.append(locale); locs.append(' '); } ft = new FieldType(); ft.setStored(false); ft.setIndexed(true); ft.setTokenized(true); field = new Field(TARGET_LOCALES_FIELD, locs.toString(), ft);//false, true, true); doc.add(field); } return doc; } /** * Create a query for TuvDocuments, optionally filtered by target locale. * * @param analyzer An analyzer contructed by makeAnalyzer * @param query A Lucene query for the text * @param targetLocale targetLocale filter on target locale (TM3) (null * for TM2) * @throws IOException */ public static Query makeQuery(Analyzer analyzer, String query, GlobalSightLocale targetLocale) throws ParseException, IOException { //escape reserved word of Lucene, // + - & | ! ( ) { } [ ] ^ ~ * ? : \ query = replaceReservedWordForLucenne(query); query = QueryParser.escape(query); query = TEXT_FIELD + ":\"" + query + "\""; if (targetLocale != null) { // from TM3 if ("".equals(query.trim())) { return null; } query = query + " AND "; query = query + TARGET_LOCALES_FIELD + ":" + targetLocale.toString(); } else { // From TM2 if ("".equals(query.trim())) { return null; } } QueryParser qp = new QueryParser(LuceneUtil.VERSION, TEXT_FIELD, analyzer); Query qq = qp.parse(query); return qq; } /** * Fix the reserved word for Lucene by leon * * @param pattern * @return */ private static String replaceReservedWordForLucenne(String pattern) { //For AND, OR, NOT pattern = replace(pattern, "AND"); pattern = replace(pattern, "OR"); pattern = replace(pattern, "NOT"); return pattern.trim(); } /** * Replace AND OR NOT * * @param pattern * @param replaceStr * @return */ private static String replace(String pattern, String replaceStr) { while (pattern.indexOf(" " + replaceStr + " ") > 0) { pattern = pattern.replace(" " + replaceStr + " ", " "); } if (pattern.startsWith(replaceStr + " ")) { pattern = pattern.substring(replaceStr.length(), pattern.length()); } if (pattern.endsWith(" " + replaceStr)) { pattern = pattern.substring(0, pattern.length() - replaceStr.length()); } if (pattern.trim().equals(replaceStr)) { pattern = ""; } return pattern; } }