List of usage examples for org.apache.lucene.analysis TokenStream incrementToken
/**
 * Advances the stream to its next token.
 *
 * <p>The usage examples in this listing call this method as a loop condition and, after each
 * {@code true} result, read the current term from an attribute obtained from the stream
 * (for example {@code CharTermAttribute} or {@code TermAttribute}).
 *
 * @return {@code true} while a token is available to the caller; presumably {@code false}
 *         once the stream is exhausted (callers here stop looping on {@code false})
 * @throws IOException declared by the signature; callers here either propagate or catch it
 */
public abstract boolean incrementToken() throws IOException;
From source file:cl.usach.escalemania.sessionbeans.DocumentoFacade.java
/**
 * Runs {@code tweet} through {@code analyzer} and returns the text of every token it produces,
 * in emission order.
 *
 * <p>The stream is always closed, including when tokenizing fails part-way, and {@code end()} is
 * called after the last token as the TokenStream consumer workflow requires.
 *
 * @param analyzer analyzer used to build the token stream
 * @param tweet    text to tokenize
 * @return the token terms in the order the analyzer emitted them
 * @throws RuntimeException wrapping any {@link IOException} raised while tokenizing
 */
public List<String> tokenizeString(Analyzer analyzer, String tweet) {
    List<String> result = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(tweet));
        try {
            // Register the attribute once up front instead of looking it up for every token.
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                result.add(term.toString());
            }
            stream.end();
        } finally {
            // Release the stream even if reset()/incrementToken() threw.
            stream.close();
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}
From source file:cn.com.szgao.enterprise.ExecutorsIndustry.java
@SuppressWarnings("rawtypes") private void readFileByLines(File file, int startNum) throws Exception { // ??//from w ww . j a va2 s . c o m int basicSum = 0; // ?? int regSum = 0; // ??? String batchNum = file.getName(); // StringBuffer sb = new StringBuffer(); // FileReader fr = new FileReader(file); // int ch = 0; // while((ch = fr.read())!=-1 ) // { // sb.append((char)ch); // } // fr.close(); // fr = null; // BufferedWriter fw = null; String encoding_from = "UTF-8";// GB18030 // String encoding_to = "UTF-8"; BufferedReader reader = null; try { // ? GB18030 // InputStreamReader isr = new InputStreamReader(new // FileInputStream(file), "UTF-8"); // InputStreamReader isr = new InputStreamReader(new // FileInputStream(file), "GBK"); InputStreamReader isr = new InputStreamReader(new FileInputStream(file), encoding_from); reader = new BufferedReader(isr); } catch (FileNotFoundException e1) { e1.printStackTrace(); } String tempT = null; int readNum = 0; // E:\\data\\--0008.txt String ss = file.getPath().substring(file.getPath().indexOf("data") + 4); String ss2 = file.getPath().substring(file.getPath().indexOf("data") + 5, file.getPath().lastIndexOf("\\")); // String folderPath = "D:/lm/log/?/" + ss2; String filePath = file.getPath().replace("???", "???industryId") .replace("E:", "D:"); // // FileUtils.newFolder(folderPath); File fileS = new File(filePath); String encoding_from1 = "UTF-8"; BufferedWriter fw = null; try { if (!fileS.exists()) { try { fileS.createNewFile(); } catch (IOException e) { e.printStackTrace(); log.error(e); } fw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileS, true), encoding_from1)); // ???? } else { fileS.delete(); fileS = new File(filePath); fw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileS, true), encoding_from1)); // ???? } } catch (FileNotFoundException e1) { e1.printStackTrace(); } int M = 1;// 1M? 
while ((tempT = reader.readLine()) != null) { // tempT=""; long size = tempT.getBytes().length; if (size > M * 1024 * 1024) { log.error("--------?1M :" + tempT.substring(0, 500)); continue; } /** * */ JSONArray holderArray = null; /** * */ JSONArray holderDetArray = null; List<String> beforeList = new ArrayList<String>(); List<String> afterList = new ArrayList<String>(); EnterpriseVO enterVO = new EnterpriseVO(); EnterpriseVO enterVOT = new EnterpriseVO(); readNum++; // if(readNum<34431){ // continue; // } // if(readNum==34431){ // // writerString(fwUn, tempT); // break; // } if (readNum < startNum) { continue; } System.out.println("-->>>>>>> " + (readNum) + "---??" + Thread.currentThread().getName() + "---" + file.getPath()); if (tempT == null || tempT == "") { continue; } String doString = tempT.toString(); JSONObject obj = null; // System.out.println("in..." + doString); try { enterVO = gs.fromJson(tempT, EnterpriseVO.class); } catch (Exception e) { e.printStackTrace(); log.error(e); continue; // return; } String industry = null; // ? Analyzer anal = new IKAnalyzer(); String indu = removeBlank(enterVO.getScope()); IndustryVO ivo = null; IndustryVO ivo1 = null; if (StringUtils.isNull(industry)) { if (!StringUtils.isNull(indu)) { String[] sourceStrArray = indu.split("[;.:,]");// ? for (String str : sourceStrArray) { // System.out.println("-- " + str); ivo = getIndustry(str); if (!StringUtils.isNull(ivo.getIndustry_id())) { break; } // ik? if (null == ivo.getIndustry_id()) { StringReader reader1 = new StringReader(str); // ? TokenStream ts = null; ts = anal.tokenStream("", reader1); CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); try { ts.reset(); String strs = ""; // ???? 
while (ts.incrementToken()) { String temp = term.toString(); if (!StringUtils.isNull(temp)) { // getIndustry(temp); strs += term.toString() + "|"; // System.out.print(term.toString() + "|" ); } } reader1.close(); String[] arrStr1 = strs.split("\\|"); StringUtils.sortStringArray(arrStr1, 1);// ? List<IndustryVO> listiv = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_v = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_n = new ArrayList<IndustryVO>(); for (int i = 0; i < arrStr1.length; i++) { String temp = arrStr1[i]; if (!StringUtils.isNull(temp)) { ivo1 = getIndustry(temp); if (!StringUtils.isNull(ivo1.getIndustry_id())) { listiv.add(ivo1); // break; } } } // ????? if (listiv.size() > 0) { for (IndustryVO industryVO : listiv) { if ("V".equals(industryVO.getFlag())) { listiv_v.add(industryVO); } else { listiv_n.add(industryVO); } } } if (listiv_v.size() > 0) { ivo = getfirstSortStringArray(listiv_v, 1); break; } if (listiv_n.size() > 0) { ivo = getfirstSortStringArray(listiv_n, 1); break; } } catch (IOException e) { e.printStackTrace(); } } if (!StringUtils.isNull(ivo.getIndustry_id())) { break; } } } } if (null != ivo) { enterVO.setIndustry(ivo.getIndustry_name()); enterVO.setIndustryId(ivo.getIndustry_id()); } // System.out.println("in..." + obj); writerString(fw, StringUtils.GSON.toJson(enterVO)); } log.info("?regSum: " + regSum + " ?basicSum: " + basicSum + " readNum: " + readNum + " -: " + (readNum - basicSum - regSum) + "---??" + file.getPath()); }
From source file:cn.com.szgao.enterprise.ExecutorsIndustry.java
/** * // www. j a v a 2 s .c o m * * @param enterVO * @return */ public static EnterpriseVO washEnterpriseVO(EnterpriseVO enterVO) throws IOException { if (!StringUtils.isNull(enterVO.getCompany())) { enterVO.setCompany(deleteMoreFuhao(enterVO.getCompany())); } // ?? String scope = enterVO.getScope(); if (!StringUtils.isNull(scope)) { scope = scope.replaceAll("[ \r\t^]", ""); if (scope.length() > 5) { String temp = ""; // System.out.println(scope); // if (scope.substring(scope.length() - 5).contains("^")) { // temp = scope.substring(scope.length() - 5).replace("^", ""); // scope = scope.substring(0, scope.length() - 5) + temp; // } else { temp = scope.substring(scope.length() - 5).replaceAll("[#*+]", ""); scope = scope.substring(0, scope.length() - 5) + temp; // } } enterVO.setScope(scope); } String regCapital = enterVO.getRegCapital(); // ?   if (!StringUtils.isNull(regCapital)) { String capital = enterVO.getRegCapital().replaceAll("[ \r\t\n]", ""); // enterVO.setRegCapital(capital); enterVO.setRegCapitalO(capital); Double regCapitalN = getBankingDouble(capital); if (null != regCapitalN) { enterVO.setRegCapitalN(regCapitalN); enterVO.setRegCapital(null); if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit("?"); } else if (regCapital.contains("?")) { enterVO.setUnit("??"); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("?")) { enterVO.setUnit("?"); } else if (regCapital.contains("?")) { enterVO.setUnit("?"); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("?")) { enterVO.setUnit("?"); } else if (regCapital.contains("")) { 
enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else if (regCapital.contains("")) { enterVO.setUnit(""); } else { enterVO.setUnit("?"); } } } // ? String industry = null; String industryId = null; /* * if (!StringUtils.isNull(enterVO.getCompany())) { String key = * StringUtils.NBG.generate(enterVO.getCompany()).toString(); * JsonDocument queryDoc = ExecutorsText.bucket.get(key, 60, * TimeUnit.MINUTES); * * if (null == queryDoc) { // System.out.println( * "NULL queryDoc-------------------- " + // key); // * System.out.println("NULL queryDoc-------------------- " + // key); * * } else { BusinessDirectoryVO vo = * StringUtils.GSON.fromJson(queryDoc.content().toString(), * BusinessDirectoryVO.class); if * (!StringUtils.isNull(vo.getIndustry())) { industry = * vo.getIndustry(); } } } */ // ? Analyzer anal = new IKAnalyzer(); String indu = removeBlank(enterVO.getScope()); IndustryVO ivo = null; IndustryVO ivo1 = null; if (StringUtils.isNull(industry)) { if (!StringUtils.isNull(indu)) { String[] sourceStrArray = indu.split("[;.:,]");// ? for (String str : sourceStrArray) { // System.out.println("-- " + str); ivo = getIndustry(str); if (!StringUtils.isNull(ivo.getIndustry_id())) { break; } // ik? if (null == ivo.getIndustry_id()) { StringReader reader = new StringReader(str); // ? TokenStream ts = null; ts = anal.tokenStream("", reader); CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); try { ts.reset(); String strs = ""; // ???? while (ts.incrementToken()) { String temp = term.toString(); if (!StringUtils.isNull(temp)) { // getIndustry(temp); strs += term.toString() + "|"; // System.out.print(term.toString() + "|" ); } } reader.close(); String[] arrStr1 = strs.split("\\|"); StringUtils.sortStringArray(arrStr1, 1);// ? 
List<IndustryVO> listiv = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_v = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_n = new ArrayList<IndustryVO>(); for (int i = 0; i < arrStr1.length; i++) { String temp = arrStr1[i]; if (!StringUtils.isNull(temp)) { ivo1 = getIndustry(temp); if (!StringUtils.isNull(ivo1.getIndustry_id())) { listiv.add(ivo1); // break; } } } // ????? if (listiv.size() > 0) { for (IndustryVO industryVO : listiv) { if ("V".equals(industryVO.getFlag())) { listiv_v.add(industryVO); } else { listiv_n.add(industryVO); } } } if (listiv_v.size() > 0) { ivo = getfirstSortStringArray(listiv_v, 1); break; } if (listiv_n.size() > 0) { ivo = getfirstSortStringArray(listiv_n, 1); break; } } catch (IOException e) { e.printStackTrace(); } } if (!StringUtils.isNull(ivo.getIndustry_id())) { break; } } } } if (null != ivo) { enterVO.setIndustry(ivo.getIndustry_name()); enterVO.setIndustryId(ivo.getIndustry_id()); } if (!StringUtils.isNull(enterVO.getRegNum())) { if (enterVO.getRegNum().indexOf("\u0000") != -1) { enterVO.setRegNum(enterVO.getRegNum().substring(0, enterVO.getRegNum().indexOf("\u0000"))); } } enterVO.setHolder(WashEtp.clearHolder(enterVO));// enterVO.setPunishment(WashEtp.clearPunishment(enterVO));// ? enterVO.setChange(WashEtp.clearChangeItem(enterVO));// ? enterVO.setReport(WashEtp.clearReport(enterVO));// return enterVO; }
From source file:cn.com.szgao.enterprise.FileIntoDataBase2p5.java
/** * ik??//www. j a va 2 s . co m * * @param str * @return * @return IndustryVO * @author liuming * @date 2016629 ?2:36:09 */ public static IndustryVO getIndustryVOFromIk(String str) { // ? Analyzer anal = new IKAnalyzer(); IndustryVO ivo = new IndustryVO(); IndustryVO ivo1 = new IndustryVO(); StringReader reader = new StringReader(str); // ? TokenStream ts = null; ts = anal.tokenStream("", reader); CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); try { ts.reset(); String strs = ""; // ???? while (ts.incrementToken()) { String temp = term.toString(); if (!StringUtils.isNull(temp)) { // getIndustry(temp); strs += term.toString() + "|"; // System.out.print(term.toString() + "|" ); } } reader.close(); String[] arrStr1 = strs.split("\\|"); StringUtils.sortStringArray(arrStr1, 1);// ? List<IndustryVO> listiv = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_v = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_n = new ArrayList<IndustryVO>(); for (int i = 0; i < arrStr1.length; i++) { String temp = arrStr1[i]; if (!StringUtils.isNull(temp)) { ivo1 = getIndustry(temp); if (!StringUtils.isNull(ivo1.getIndustry_id())) { listiv.add(ivo1); // break; } } } // ????? // if (listiv.size() > 0) { // for (IndustryVO industryVO : listiv) { // if ("V".equals(industryVO.getFlag())) { // listiv_v.add(industryVO); // } else { // listiv_n.add(industryVO); // } // } // } // ??? if (listiv.size() > 0) { ivo = getfirstSortStringArray(listiv, 1); } // if (listiv_v.size() > 0) { // ivo = getfirstSortStringArray(listiv_v, 1); // } // if (listiv_n.size() > 0) { // ivo = getfirstSortStringArray(listiv_n, 1); // } } catch (IOException e) { e.printStackTrace(); } return ivo; }
From source file:cn.com.szgao.enterprise.FileIntoDataBase2p5WashJson.java
/** * ik??// w ww . j a va 2 s . c o m * * @param str * @return * @return IndustryVO * @author liuming * @date 2016629 ?2:36:09 */ public static IndustryVO getIndustryVOFromIk(String str) { // ? Analyzer anal = new IKAnalyzer(); IndustryVO ivo = new IndustryVO(); IndustryVO ivo1 = new IndustryVO(); StringReader reader = new StringReader(str); // ? TokenStream ts = null; ts = anal.tokenStream("", reader); CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); try { ts.reset(); String strs = ""; // ???? while (ts.incrementToken()) { String temp = term.toString(); if (!StringUtils.isNull(temp)) { // getIndustry(temp); strs += term.toString() + "|"; // System.out.print(term.toString() + "|" ); } } reader.close(); String[] arrStr1 = strs.split("\\|"); StringUtils.sortStringArray(arrStr1, 1);// ? List<IndustryVO> listiv = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_v = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_n = new ArrayList<IndustryVO>(); for (int i = 0; i < arrStr1.length; i++) { String temp = arrStr1[i]; if (!StringUtils.isNull(temp)) { ivo1 = getIndustry(temp); if (!StringUtils.isNull(ivo1.getIndustry_id())) { listiv.add(ivo1); // break; } } } // ????? // if (listiv.size() > 0) { // for (IndustryVO industryVO : listiv) { // if ("V".equals(industryVO.getFlag())) { // listiv_v.add(industryVO); // } else { // listiv_n.add(industryVO); // } // } // } // ??? if (listiv.size() > 0) { ivo = getfirstSortStringArray(listiv, 1); } // if (listiv_v.size() > 0) { // ivo = getfirstSortStringArray(listiv_v, 1); // } // if (listiv_n.size() > 0) { // ivo = getfirstSortStringArray(listiv_n, 1); // } } catch (IOException e) { e.printStackTrace(); } return ivo; }
From source file:cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java
License:Open Source License
public boolean contain(String label) { try {/*from www.jav a 2 s .com*/ IndexReader reader = IndexReader.open(this.indexDir, true); Searcher searcher = new IndexSearcher(reader); // use the boolean query HashSet<String> queryTermSet = new HashSet<String>(); TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(label)); TermAttribute termAtt = stream.addAttribute(TermAttribute.class); stream.reset(); while (stream.incrementToken()) { queryTermSet.add(termAtt.term()); } stream.end(); stream.close(); // construct the query BooleanQuery bq = new BooleanQuery(); Iterator<String> it = queryTermSet.iterator(); while (it.hasNext()) { String s = it.next(); Term term = new Term(LabelDocument.FIELD_LABEL, s); TermQuery termQuery = new TermQuery(term); bq.add(termQuery, Occur.MUST); } ExactLabelQueryResultCollector collector = new ExactLabelQueryResultCollector(reader, label); searcher.search(bq, collector); boolean ret = collector.isExistQueryLabel(); reader.close(); return ret; } catch (Exception e) { e.printStackTrace(); } return false; }
From source file:cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java
License:Open Source License
public TreeSet<SimilarLabelQueryResult> getSimilarLabels(String query, float similarity) { TreeSet<SimilarLabelQueryResult> ret = new TreeSet<SimilarLabelQueryResult>(); if (query == null) { ret.add(new SimilarLabelQueryResult(null, 1)); return ret; }// www . j a va 2s . com try { IndexReader reader = IndexReader.open(this.indexDir, true); Searcher searcher = new IndexSearcher(reader); // get terms from query HashSet<String> queryTermSet = new HashSet<String>(); TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(query)); TermAttribute termAtt = stream.addAttribute(TermAttribute.class); stream.reset(); while (stream.incrementToken()) { queryTermSet.add(termAtt.term()); } stream.end(); stream.close(); // construct the query BooleanQuery bq = new BooleanQuery(); Iterator<String> it = queryTermSet.iterator(); SynonymMap synMap = SynonymIndex.getSynonymMap(); HashSet<String> expandedQueryTermSet = new HashSet<String>(queryTermSet); while (it.hasNext()) { String s = it.next(); Term term = new Term(LabelDocument.FIELD_LABEL, s); TermQuery termQuery = new TermQuery(term); bq.add(termQuery, Occur.SHOULD); // expand using synonyms for (String syn : synMap.getSynonyms(s)) { stemer.setCurrent(syn); stemer.stem(); syn = stemer.getCurrent(); if (expandedQueryTermSet.add(syn)) { term = new Term(LabelDocument.FIELD_LABEL, syn); termQuery = new TermQuery(term); bq.add(termQuery, Occur.SHOULD); } } } // search in the label index SimilarLabelQueryResultCollector collector = new SimilarLabelQueryResultCollector(reader, queryTermSet, similarity); searcher.search(bq, collector); ret = collector.getQueryResult(); searcher.close(); reader.close(); } catch (Exception e) { e.printStackTrace(); } return ret; }
From source file:cn.edu.thss.iise.beehivez.server.index.luceneindex.analyzer.SemicolonAnalyzer.java
License:Open Source License
/** * @param args/*from w w w .ja v a2 s . c om*/ */ public static void main(String[] args) throws IOException { // text to tokenize final String text = "This is a demo of , the new TokenStream API"; SemicolonAnalyzer analyzer = new SemicolonAnalyzer(); TokenStream stream = analyzer.tokenStream("field", new StringReader(text)); // get the TermAttribute from the TokenStream TermAttribute termAtt = stream.addAttribute(TermAttribute.class); stream.reset(); // print all tokens until stream is exhausted while (stream.incrementToken()) { System.out.println(termAtt.term()); } stream.end(); stream.close(); }
From source file:cn.edu.thss.iise.beehivez.server.util.StringSimilarityUtil.java
License:Open Source License
/** * tokenize the given string, all the words are extracted, lowercased, all * the stop words are removed, and all the words are replaced with their * stem/*from w w w.j av a 2s .com*/ * * @param label * @return */ public static HashSet<String> snowballTokenize(String label) { HashSet<String> ret = new HashSet<String>(); try { Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English", StandardAnalyzer.STOP_WORDS_SET); TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(label)); TermAttribute termAtt = stream.addAttribute(TermAttribute.class); stream.reset(); while (stream.incrementToken()) { ret.add(termAtt.term()); } stream.end(); stream.close(); } catch (Exception e) { e.printStackTrace(); } return ret; }
From source file:cn.jcenterhome.web.action.CpAction.java
private List<String> getKeyWord(String text) throws IOException { List<String> keywords = new ArrayList<String>(); if (!Common.empty(text)) { Map<String, Integer> words = new HashMap<String, Integer>(); Analyzer analyzer = new IKAnalyzer(true); StringReader reader = new StringReader(text); TokenStream tokenStream = analyzer.tokenStream("*", reader); TermAttribute termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class); while (tokenStream.incrementToken()) { String word = termAtt.term(); if (word.length() > 1 && Common.strlen(word) > 2) { Integer count = words.get(word); if (count == null) { count = 0;// w w w. jav a 2s .c o m } words.put(word, count + 1); } } if (words.size() > 0) { Directory dir = null; IndexSearcher searcher = null; try { String fieldName = "text"; dir = new RAMDirectory(); IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED); Document doc = new Document(); doc.add(new Field(fieldName, text, Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); writer.close(); searcher = new IndexSearcher(dir); searcher.setSimilarity(new IKSimilarity()); Set<String> keys = words.keySet(); Map<String, Float> temps = new HashMap<String, Float>(); for (String key : keys) { int count = words.get(key); Query query = IKQueryParser.parse(fieldName, key); TopDocs topDocs = searcher.search(query, 1); if (topDocs.totalHits > 0) { temps.put(key, topDocs.getMaxScore() * count); } } Entry<String, Float>[] keywordEntry = getSortedHashtableByValue(temps); for (Entry<String, Float> entry : keywordEntry) { if (keywords.size() < 5) { keywords.add(entry.getKey()); } } } catch (Exception e) { e.printStackTrace(); } finally { try { searcher.close(); } catch (IOException e) { e.printStackTrace(); } try { dir.close(); } catch (IOException e) { e.printStackTrace(); } } } } return keywords; }