List of usage examples for org.apache.lucene.analysis TokenStream reset
public void reset() throws IOException
From source file:cn.com.szgao.enterprise.ExecutorsIndustry.java
// NOTE(review): this method comes from a scraped listing and its original
// Chinese comments and string literals were destroyed by an encoding failure
// (mojibake): most of the regCapital unit chain now reads contains("") /
// setUnit(""). Since "" is a substring of every string, the FIRST
// contains("") branch always matches whenever regCapital parses, so the rest
// of the chain is dead as written — recover the original literals from
// version control before editing this chain.
// Purpose (from the surviving code): normalizes an EnterpriseVO in place —
// cleans the company name via deleteMoreFuhao, strips whitespace/symbols from
// the business scope, parses registered capital into a numeric value
// (getBankingDouble) plus a currency-unit string, derives industry id/name by
// dictionary lookup (getIndustry) and, failing that, IK tokenization of each
// scope segment, truncates regNum at an embedded NUL character, and washes
// holder/punishment/change/report fields via WashEtp helpers.
// Returns the same enterVO instance it was given.
// NOTE(review): the TokenStream `ts` below is reset and consumed but never
// end()ed or closed — a resource/contract leak; left untouched here because
// this block is documentation-only.
/** * //from w w w .j av a 2 s. c o m * * @param enterVO * @return */ public static EnterpriseVO washEnterpriseVO(EnterpriseVO enterVO) throws IOException { if (!StringUtils.isNull(enterVO.getCompany())) { enterVO.setCompany(deleteMoreFuhao(enterVO.getCompany())); } // ?? String scope = enterVO.getScope(); if (!StringUtils.isNull(scope)) { scope = scope.replaceAll("[ \r\t^]", ""); if (scope.length() > 5) { String temp = ""; // System.out.println(scope); // if (scope.substring(scope.length() - 5).contains("^")) { // temp = scope.substring(scope.length() - 5).replace("^", ""); // scope = scope.substring(0, scope.length() - 5) + temp; // } else { temp = scope.substring(scope.length() - 5).replaceAll("[#*+]", ""); scope = scope.substring(0, scope.length() - 5) + temp; // } } enterVO.setScope(scope); } String regCapital = enterVO.getRegCapital(); // ?   if (!StringUtils.isNull(regCapital)) { String capital = enterVO.getRegCapital().replaceAll("[ \r\t\n]", ""); // enterVO.setRegCapital(capital); enterVO.setRegCapitalO(capital); Double regCapitalN = getBankingDouble(capital); if (null != regCapitalN) { enterVO.setRegCapitalN(regCapitalN); enterVO.setRegCapital(null); if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit("?"); } else if (regCapital.contains("?")) { enterVO.setUnit("??"); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("?")) { enterVO.setUnit("?"); } else if (regCapital.contains("?")) { enterVO.setUnit("?"); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("?")) { enterVO.setUnit("?"); } else if (regCapital.contains("")) { 
enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else { enterVO.setUnit("?"); } } } // ? String industry = null; String industryId = null; /* * if (!StringUtils.isNull(enterVO.getCompany())) { String key = * StringUtils.NBG.generate(enterVO.getCompany()).toString(); * JsonDocument queryDoc = ExecutorsText.bucket.get(key, 60, * TimeUnit.MINUTES); * * if (null == queryDoc) { // System.out.println( * "NULL queryDoc-------------------- " + // key); // * System.out.println("NULL queryDoc-------------------- " + // key); * * } else { BusinessDirectoryVO vo = * StringUtils.GSON.fromJson(queryDoc.content().toString(), * BusinessDirectoryVO.class); if * (!StringUtils.isNull(vo.getIndustry())) { industry = * vo.getIndustry(); } } } */ // ? Analyzer anal = new IKAnalyzer(); String indu = removeBlank(enterVO.getScope()); IndustryVO ivo = null; IndustryVO ivo1 = null; if (StringUtils.isNull(industry)) { if (!StringUtils.isNull(indu)) { String[] sourceStrArray = indu.split("[;.:,]");// ? for (String str : sourceStrArray) { // System.out.println("-- " + str); ivo = getIndustry(str); if (!StringUtils.isNull(ivo.getIndustry_id())) { break; } // ik? if (null == ivo.getIndustry_id()) { StringReader reader = new StringReader(str); // ? TokenStream ts = null; ts = anal.tokenStream("", reader); CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); try { ts.reset(); String strs = ""; // ???? while (ts.incrementToken()) { String temp = term.toString(); if (!StringUtils.isNull(temp)) { // getIndustry(temp); strs += term.toString() + "|"; // System.out.print(term.toString() + "|" ); } } reader.close(); String[] arrStr1 = strs.split("\\|"); StringUtils.sortStringArray(arrStr1, 1);// ? 
List<IndustryVO> listiv = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_v = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_n = new ArrayList<IndustryVO>(); for (int i = 0; i < arrStr1.length; i++) { String temp = arrStr1[i]; if (!StringUtils.isNull(temp)) { ivo1 = getIndustry(temp); if (!StringUtils.isNull(ivo1.getIndustry_id())) { listiv.add(ivo1); // break; } } } // ????? if (listiv.size() > 0) { for (IndustryVO industryVO : listiv) { if ("V".equals(industryVO.getFlag())) { listiv_v.add(industryVO); } else { listiv_n.add(industryVO); } } } if (listiv_v.size() > 0) { ivo = getfirstSortStringArray(listiv_v, 1); break; } if (listiv_n.size() > 0) { ivo = getfirstSortStringArray(listiv_n, 1); break; } } catch (IOException e) { e.printStackTrace(); } } if (!StringUtils.isNull(ivo.getIndustry_id())) { break; } } } } if (null != ivo) { enterVO.setIndustry(ivo.getIndustry_name()); enterVO.setIndustryId(ivo.getIndustry_id()); } if (!StringUtils.isNull(enterVO.getRegNum())) { if (enterVO.getRegNum().indexOf("\u0000") != -1) { enterVO.setRegNum(enterVO.getRegNum().substring(0, enterVO.getRegNum().indexOf("\u0000"))); } } enterVO.setHolder(WashEtp.clearHolder(enterVO));// enterVO.setPunishment(WashEtp.clearPunishment(enterVO));// ? enterVO.setChange(WashEtp.clearChangeItem(enterVO));// ? enterVO.setReport(WashEtp.clearReport(enterVO));// return enterVO; }
From source file:cn.com.szgao.enterprise.FileIntoDataBase2p5.java
/** * ik??//w w w . j a v a 2 s .co m * * @param str * @return * @return IndustryVO * @author liuming * @date 2016629 ?2:36:09 */ public static IndustryVO getIndustryVOFromIk(String str) { // ? Analyzer anal = new IKAnalyzer(); IndustryVO ivo = new IndustryVO(); IndustryVO ivo1 = new IndustryVO(); StringReader reader = new StringReader(str); // ? TokenStream ts = null; ts = anal.tokenStream("", reader); CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); try { ts.reset(); String strs = ""; // ???? while (ts.incrementToken()) { String temp = term.toString(); if (!StringUtils.isNull(temp)) { // getIndustry(temp); strs += term.toString() + "|"; // System.out.print(term.toString() + "|" ); } } reader.close(); String[] arrStr1 = strs.split("\\|"); StringUtils.sortStringArray(arrStr1, 1);// ? List<IndustryVO> listiv = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_v = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_n = new ArrayList<IndustryVO>(); for (int i = 0; i < arrStr1.length; i++) { String temp = arrStr1[i]; if (!StringUtils.isNull(temp)) { ivo1 = getIndustry(temp); if (!StringUtils.isNull(ivo1.getIndustry_id())) { listiv.add(ivo1); // break; } } } // ????? // if (listiv.size() > 0) { // for (IndustryVO industryVO : listiv) { // if ("V".equals(industryVO.getFlag())) { // listiv_v.add(industryVO); // } else { // listiv_n.add(industryVO); // } // } // } // ??? if (listiv.size() > 0) { ivo = getfirstSortStringArray(listiv, 1); } // if (listiv_v.size() > 0) { // ivo = getfirstSortStringArray(listiv_v, 1); // } // if (listiv_n.size() > 0) { // ivo = getfirstSortStringArray(listiv_n, 1); // } } catch (IOException e) { e.printStackTrace(); } return ivo; }
From source file:cn.com.szgao.enterprise.FileIntoDataBase2p5WashJson.java
/** * ik??/* w ww . j ava 2 s. co m*/ * * @param str * @return * @return IndustryVO * @author liuming * @date 2016629 ?2:36:09 */ public static IndustryVO getIndustryVOFromIk(String str) { // ? Analyzer anal = new IKAnalyzer(); IndustryVO ivo = new IndustryVO(); IndustryVO ivo1 = new IndustryVO(); StringReader reader = new StringReader(str); // ? TokenStream ts = null; ts = anal.tokenStream("", reader); CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); try { ts.reset(); String strs = ""; // ???? while (ts.incrementToken()) { String temp = term.toString(); if (!StringUtils.isNull(temp)) { // getIndustry(temp); strs += term.toString() + "|"; // System.out.print(term.toString() + "|" ); } } reader.close(); String[] arrStr1 = strs.split("\\|"); StringUtils.sortStringArray(arrStr1, 1);// ? List<IndustryVO> listiv = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_v = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_n = new ArrayList<IndustryVO>(); for (int i = 0; i < arrStr1.length; i++) { String temp = arrStr1[i]; if (!StringUtils.isNull(temp)) { ivo1 = getIndustry(temp); if (!StringUtils.isNull(ivo1.getIndustry_id())) { listiv.add(ivo1); // break; } } } // ????? // if (listiv.size() > 0) { // for (IndustryVO industryVO : listiv) { // if ("V".equals(industryVO.getFlag())) { // listiv_v.add(industryVO); // } else { // listiv_n.add(industryVO); // } // } // } // ??? if (listiv.size() > 0) { ivo = getfirstSortStringArray(listiv, 1); } // if (listiv_v.size() > 0) { // ivo = getfirstSortStringArray(listiv_v, 1); // } // if (listiv_n.size() > 0) { // ivo = getfirstSortStringArray(listiv_n, 1); // } } catch (IOException e) { e.printStackTrace(); } return ivo; }
From source file:cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java
License:Open Source License
public boolean contain(String label) { try {// ww w . j av a 2 s. co m IndexReader reader = IndexReader.open(this.indexDir, true); Searcher searcher = new IndexSearcher(reader); // use the boolean query HashSet<String> queryTermSet = new HashSet<String>(); TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(label)); TermAttribute termAtt = stream.addAttribute(TermAttribute.class); stream.reset(); while (stream.incrementToken()) { queryTermSet.add(termAtt.term()); } stream.end(); stream.close(); // construct the query BooleanQuery bq = new BooleanQuery(); Iterator<String> it = queryTermSet.iterator(); while (it.hasNext()) { String s = it.next(); Term term = new Term(LabelDocument.FIELD_LABEL, s); TermQuery termQuery = new TermQuery(term); bq.add(termQuery, Occur.MUST); } ExactLabelQueryResultCollector collector = new ExactLabelQueryResultCollector(reader, label); searcher.search(bq, collector); boolean ret = collector.isExistQueryLabel(); reader.close(); return ret; } catch (Exception e) { e.printStackTrace(); } return false; }
From source file:cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java
License:Open Source License
/**
 * Finds indexed labels similar to {@code query}.
 *
 * <p>The query is analyzed into terms, each term is expanded with its stemmed
 * synonyms, and all of them are OR-ed (SHOULD) into one boolean query. The
 * collector scores candidates against the ORIGINAL (unexpanded) term set and
 * keeps those at or above {@code similarity}.
 *
 * @param query      the label text to match; {@code null} yields a single
 *                   result wrapping {@code null} with similarity 1
 * @param similarity minimum similarity threshold for returned labels
 * @return matching labels ordered by the result type's natural ordering;
 *         empty on error (errors are logged best-effort)
 */
public TreeSet<SimilarLabelQueryResult> getSimilarLabels(String query, float similarity) {
    TreeSet<SimilarLabelQueryResult> ret = new TreeSet<SimilarLabelQueryResult>();
    if (query == null) {
        // A null query is defined to match "nothing" perfectly.
        ret.add(new SimilarLabelQueryResult(null, 1));
        return ret;
    }
    try {
        IndexReader reader = IndexReader.open(this.indexDir, true);
        Searcher searcher = new IndexSearcher(reader);
        // get terms from query
        HashSet<String> queryTermSet = new HashSet<String>();
        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(query));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            queryTermSet.add(termAtt.term());
        }
        stream.end();
        stream.close();
        // construct the query
        BooleanQuery bq = new BooleanQuery();
        Iterator<String> it = queryTermSet.iterator();
        SynonymMap synMap = SynonymIndex.getSynonymMap();
        // expandedQueryTermSet tracks what is already in the query so a
        // synonym shared by two terms is only added once.
        HashSet<String> expandedQueryTermSet = new HashSet<String>(queryTermSet);
        while (it.hasNext()) {
            String s = it.next();
            Term term = new Term(LabelDocument.FIELD_LABEL, s);
            TermQuery termQuery = new TermQuery(term);
            bq.add(termQuery, Occur.SHOULD);
            // expand using synonyms; each synonym is stemmed so it matches
            // the stemmed terms stored in the index
            for (String syn : synMap.getSynonyms(s)) {
                stemer.setCurrent(syn);
                stemer.stem();
                syn = stemer.getCurrent();
                if (expandedQueryTermSet.add(syn)) {
                    term = new Term(LabelDocument.FIELD_LABEL, syn);
                    termQuery = new TermQuery(term);
                    bq.add(termQuery, Occur.SHOULD);
                }
            }
        }
        // search in the label index; note the collector scores against the
        // original queryTermSet, not the synonym-expanded one
        SimilarLabelQueryResultCollector collector = new SimilarLabelQueryResultCollector(reader, queryTermSet,
                similarity);
        searcher.search(bq, collector);
        ret = collector.getQueryResult();
        searcher.close();
        reader.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}
From source file:cn.edu.thss.iise.beehivez.server.index.luceneindex.analyzer.SemicolonAnalyzer.java
License:Open Source License
/** * @param args//from w w w . j a v a 2 s. c o m */ public static void main(String[] args) throws IOException { // text to tokenize final String text = "This is a demo of , the new TokenStream API"; SemicolonAnalyzer analyzer = new SemicolonAnalyzer(); TokenStream stream = analyzer.tokenStream("field", new StringReader(text)); // get the TermAttribute from the TokenStream TermAttribute termAtt = stream.addAttribute(TermAttribute.class); stream.reset(); // print all tokens until stream is exhausted while (stream.incrementToken()) { System.out.println(termAtt.term()); } stream.end(); stream.close(); }
From source file:cn.edu.thss.iise.beehivez.server.util.StringSimilarityUtil.java
License:Open Source License
/**
 * Tokenizes {@code label} with a Snowball (English) analyzer: words are
 * extracted and lowercased, standard stop words are removed, and each
 * remaining word is reduced to its stem.
 *
 * @param label the text to tokenize
 * @return the set of distinct stems; empty on error (errors are logged
 *         best-effort)
 */
public static HashSet<String> snowballTokenize(String label) {
    HashSet<String> stems = new HashSet<String>();
    try {
        Analyzer snowball = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English",
                StandardAnalyzer.STOP_WORDS_SET);
        TokenStream tokens = snowball.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(label));
        TermAttribute term = tokens.addAttribute(TermAttribute.class);
        tokens.reset();
        while (tokens.incrementToken()) {
            stems.add(term.term());
        }
        tokens.end();
        tokens.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return stems;
}
From source file:com.antsdb.saltedfish.sql.vdm.LuceneUtil.java
License:Open Source License
static void tokenize(String text, BiConsumer<String, String> lambda) { try (StandardAnalyzer analyzer = new StandardAnalyzer()) { TokenStream stream = analyzer.tokenStream("", text); CharTermAttribute term = stream.getAttribute(CharTermAttribute.class); TypeAttribute type = stream.getAttribute(TypeAttribute.class); stream.reset(); while (stream.incrementToken()) { lambda.accept(type.type(), term.toString()); }//from w w w . j ava2 s .c o m } catch (IOException x) { throw new RuntimeException(x); } }
From source file:com.b2international.index.compat.Highlighting.java
License:Apache License
/** * Splits a string to a list of tokens using the specified Lucene analyzer. * /* ww w.ja v a 2s . c o m*/ * @param analyzer the analyzer determining token boundaries (may not be {@code null}) * @param s the string to split * @return a list of tokens, or an empty list if {@code s} is {@code null} or empty */ public static List<String> split(Analyzer analyzer, final String s) { checkNotNull(analyzer, "analyzer"); if (Strings.isNullOrEmpty(s)) { return ImmutableList.of(); } final List<String> tokens = Lists.newArrayList(); TokenStream stream = null; try { stream = analyzer.tokenStream(null, new StringReader(s)); stream.reset(); while (stream.incrementToken()) { tokens.add(stream.getAttribute(CharTermAttribute.class).toString()); } } catch (final IOException ignored) { // Should not be thrown when using a string reader } finally { endAndCloseQuietly(stream); } return tokens; }
From source file:com.basistech.elasticsearch.index.analysis.rosette.SimpleRosetteAnalysisTests.java
License:Open Source License
/**
 * Drains {@code stream} and asserts that it produces exactly the terms in
 * {@code expected}, in order.
 *
 * <p>NOTE: this uses TestNG's {@code Assert}, whose argument order is
 * (actual, expected, message) — the opposite of JUnit.
 *
 * @param stream   token stream under test; not yet consumed ({@code reset()}
 *                 is called here)
 * @param expected the expected term texts, in emission order
 * @throws IOException if the stream fails while advancing
 */
public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    Assert.assertNotNull(termAttr);
    int i = 0;
    while (stream.incrementToken()) {
        String s = termAttr.toString();
        // Fail fast if the stream yields more terms than expected.
        Assert.assertTrue(i < expected.length, "got extra term: " + s);
        Assert.assertEquals(termAttr.toString(), expected[i], "expected different term at index " + i);
        i++;
    }
    // The stream must not end early either.
    Assert.assertEquals(i, expected.length, "not all tokens produced");
}