List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
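The caller must pass in a Class<? extends Attribute> value. Throws IllegalArgumentException if this AttributeSource does not contain the requested Attribute.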
From source file:cl.usach.escalemania.sessionbeans.DocumentoFacade.java
/**
 * Splits {@code tweet} into terms using the given analyzer.
 *
 * <p>The token stream is always closed, even when tokenization fails, and
 * {@code end()} is called after the last token as the TokenStream consumer
 * workflow requires.
 *
 * @param analyzer the analyzer that produces the token stream
 * @param tweet    the text to tokenize
 * @return the terms of {@code tweet} in stream order; empty if the analyzer emits no tokens
 * @throws RuntimeException wrapping any {@link IOException} raised while tokenizing
 */
public List<String> tokenizeString(Analyzer analyzer, String tweet) {
    List<String> result = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(tweet));
        try {
            // The attribute instance is shared across tokens, so look it up once.
            CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                result.add(term.toString());
            }
            stream.end();
        } finally {
            // Close on every path; an unclosed stream breaks analyzer reuse.
            stream.close();
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}
From source file:cn.com.szgao.enterprise.ExecutorsIndustry.java
/**
 * Reads {@code file} line by line as UTF-8, parses each line into an
 * {@code EnterpriseVO} with {@code gs.fromJson}, tries to resolve an industry
 * from the record's scope text, and hands the re-serialized record to
 * {@code writerString} together with a writer opened on an output file whose
 * path is derived from the input path by two literal replacements.
 *
 * <p>Lines larger than 1 MiB (in bytes) and lines whose 1-based number is
 * below {@code startNum} are skipped. Lines that fail JSON parsing are logged
 * and skipped.
 *
 * @param file     the input file, one JSON document per line
 * @param startNum the first line number (1-based, counting only lines that
 *                 pass the size check) to process
 * @throws Exception if reading or any called helper fails
 */
@SuppressWarnings("rawtypes")
private void readFileByLines(File file, int startNum) throws Exception {
    // Neither counter is incremented anywhere in this method; both are only
    // reported in the final log line.
    int basicSum = 0;
    int regSum = 0;
    // NOTE(review): batchNum is not read again in this method.
    String batchNum = file.getName();
    // Input encoding; commented-out earlier variants used GB18030 / GBK.
    String encoding_from = "UTF-8";
    BufferedReader reader = null;
    try {
        InputStreamReader isr = new InputStreamReader(new FileInputStream(file), encoding_from);
        reader = new BufferedReader(isr);
    } catch (FileNotFoundException e1) {
        // NOTE(review): reader stays null here, so the readLine() call below
        // would fail with a NullPointerException.
        e1.printStackTrace();
    }
    String tempT = null;
    int readNum = 0;
    // NOTE(review): ss and ss2 are computed but not used by any live code below.
    String ss = file.getPath().substring(file.getPath().indexOf("data") + 4);
    String ss2 = file.getPath().substring(file.getPath().indexOf("data") + 5,
            file.getPath().lastIndexOf("\\"));
    // Output path: the input path with two literal substitutions applied.
    String filePath = file.getPath().replace("???", "???industryId")
            .replace("E:", "D:");
    File fileS = new File(filePath);
    String encoding_from1 = "UTF-8";
    BufferedWriter fw = null;
    try {
        if (!fileS.exists()) {
            try {
                fileS.createNewFile();
            } catch (IOException e) {
                e.printStackTrace();
                log.error(e);
            }
            fw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileS, true), encoding_from1));
        } else {
            // An existing output file is deleted first, so the append-mode
            // stream below starts from an empty file.
            fileS.delete();
            fileS = new File(filePath);
            fw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileS, true), encoding_from1));
        }
    } catch (FileNotFoundException e1) {
        // NOTE(review): fw stays null here and is still passed to writerString below.
        e1.printStackTrace();
    }
    // Size limit for a single line, in MiB.
    int M = 1;
    // NOTE(review): neither reader nor fw is closed anywhere in this method.
    while ((tempT = reader.readLine()) != null) {
        // NOTE(review): getBytes() without a charset uses the platform default.
        long size = tempT.getBytes().length;
        if (size > M * 1024 * 1024) {
            // Oversized lines are skipped before readNum is incremented, so
            // they do not count towards startNum.
            log.error("--------?1M :" + tempT.substring(0, 500));
            continue;
        }
        // NOTE(review): the next six locals are never read by live code.
        JSONArray holderArray = null;
        JSONArray holderDetArray = null;
        List<String> beforeList = new ArrayList<String>();
        List<String> afterList = new ArrayList<String>();
        EnterpriseVO enterVO = new EnterpriseVO();
        EnterpriseVO enterVOT = new EnterpriseVO();
        readNum++;
        if (readNum < startNum) {
            continue;
        }
        System.out.println("-->>>>>>> " + (readNum) + "---??" + Thread.currentThread().getName() + "---"
                + file.getPath());
        // NOTE(review): tempT cannot be null inside this loop, and == compares
        // references, so an empty line is most likely not caught here.
        if (tempT == null || tempT == "") {
            continue;
        }
        String doString = tempT.toString();
        JSONObject obj = null;
        try {
            enterVO = gs.fromJson(tempT, EnterpriseVO.class);
        } catch (Exception e) {
            // A line that cannot be parsed is logged and skipped.
            e.printStackTrace();
            log.error(e);
            continue;
        }
        // industry is never assigned, so the outer check below only depends on
        // how StringUtils.isNull treats null (presumably true - TODO confirm).
        String industry = null;
        // NOTE(review): a new analyzer is created for every input line.
        Analyzer anal = new IKAnalyzer();
        String indu = removeBlank(enterVO.getScope());
        IndustryVO ivo = null;
        IndustryVO ivo1 = null;
        if (StringUtils.isNull(industry)) {
            if (!StringUtils.isNull(indu)) {
                // Split the scope text on ; . : , and examine each fragment in turn.
                String[] sourceStrArray = indu.split("[;.:,]");
                for (String str : sourceStrArray) {
                    // First try a direct lookup of the whole fragment.
                    ivo = getIndustry(str);
                    if (!StringUtils.isNull(ivo.getIndustry_id())) {
                        break;
                    }
                    // No id from the direct lookup: tokenize the fragment and
                    // look up the individual tokens instead.
                    if (null == ivo.getIndustry_id()) {
                        StringReader reader1 = new StringReader(str);
                        TokenStream ts = null;
                        ts = anal.tokenStream("", reader1);
                        CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
                        // NOTE(review): ts is never ended or closed; only the
                        // underlying StringReader is closed.
                        try {
                            ts.reset();
                            // Tokens are joined with '|' and split again below.
                            String strs = "";
                            while (ts.incrementToken()) {
                                String temp = term.toString();
                                if (!StringUtils.isNull(temp)) {
                                    strs += term.toString() + "|";
                                }
                            }
                            reader1.close();
                            String[] arrStr1 = strs.split("\\|");
                            // Ordering is delegated to the helper; its criteria
                            // are not visible here.
                            StringUtils.sortStringArray(arrStr1, 1);
                            List<IndustryVO> listiv = new ArrayList<IndustryVO>();
                            List<IndustryVO> listiv_v = new ArrayList<IndustryVO>();
                            List<IndustryVO> listiv_n = new ArrayList<IndustryVO>();
                            // Collect every token whose lookup yields an industry id.
                            for (int i = 0; i < arrStr1.length; i++) {
                                String temp = arrStr1[i];
                                if (!StringUtils.isNull(temp)) {
                                    ivo1 = getIndustry(temp);
                                    if (!StringUtils.isNull(ivo1.getIndustry_id())) {
                                        listiv.add(ivo1);
                                    }
                                }
                            }
                            // Partition the matches by flag: "V" versus everything else.
                            if (listiv.size() > 0) {
                                for (IndustryVO industryVO : listiv) {
                                    if ("V".equals(industryVO.getFlag())) {
                                        listiv_v.add(industryVO);
                                    } else {
                                        listiv_n.add(industryVO);
                                    }
                                }
                            }
                            // Matches flagged "V" take precedence over the others.
                            if (listiv_v.size() > 0) {
                                ivo = getfirstSortStringArray(listiv_v, 1);
                                break;
                            }
                            if (listiv_n.size() > 0) {
                                ivo = getfirstSortStringArray(listiv_n, 1);
                                break;
                            }
                        } catch (IOException e) {
                            e.printStackTrace();
                        }
                    }
                    if (!StringUtils.isNull(ivo.getIndustry_id())) {
                        break;
                    }
                }
            }
        }
        // ivo is non-null as soon as one fragment was examined, even when no
        // industry id was found; its name/id are copied regardless.
        if (null != ivo) {
            enterVO.setIndustry(ivo.getIndustry_name());
            enterVO.setIndustryId(ivo.getIndustry_id());
        }
        writerString(fw, StringUtils.GSON.toJson(enterVO));
    }
    log.info("?regSum: " + regSum + " ?basicSum: " + basicSum + " readNum: " + readNum + " -: "
            + (readNum - basicSum - regSum) + "---??" + file.getPath());
}
From source file:cn.com.szgao.enterprise.ExecutorsIndustry.java
/**
 * Normalizes several fields of the given {@code EnterpriseVO} in place and
 * returns the same instance.
 *
 * <p>In order: the company name is passed through {@code deleteMoreFuhao};
 * the scope text is stripped of some characters; the registered capital is
 * parsed into a number plus unit; an industry is resolved from the scope
 * text; the registration number is cut at the first NUL character; and the
 * holder, punishment, change and report fields are replaced by the results
 * of the corresponding {@code WashEtp.clear*} helpers.
 *
 * @param enterVO the record to clean; modified in place
 * @return the same {@code enterVO} instance
 * @throws IOException declared by the signature; IOExceptions raised while
 *         tokenizing are caught and printed inside the method
 */
public static EnterpriseVO washEnterpriseVO(EnterpriseVO enterVO) throws IOException {
    if (!StringUtils.isNull(enterVO.getCompany())) {
        enterVO.setCompany(deleteMoreFuhao(enterVO.getCompany()));
    }
    // Scope: remove spaces, CR, TAB and '^' everywhere, then strip '#', '*'
    // and '+' from the last five characters only.
    String scope = enterVO.getScope();
    if (!StringUtils.isNull(scope)) {
        scope = scope.replaceAll("[ \r\t^]", "");
        if (scope.length() > 5) {
            String temp = "";
            temp = scope.substring(scope.length() - 5).replaceAll("[#*+]", "");
            scope = scope.substring(0, scope.length() - 5) + temp;
        }
        enterVO.setScope(scope);
    }
    String regCapital = enterVO.getRegCapital();
    // Registered capital: keep the whitespace-stripped text in RegCapitalO and,
    // when it parses to a number, store the number and derive a unit.
    if (!StringUtils.isNull(regCapital)) {
        String capital = enterVO.getRegCapital().replaceAll("[ \r\t\n]", "");
        enterVO.setRegCapitalO(capital);
        Double regCapitalN = getBankingDouble(capital);
        if (null != regCapitalN) {
            enterVO.setRegCapitalN(regCapitalN);
            // The free-text value is cleared once a numeric value is available.
            enterVO.setRegCapital(null);
            // NOTE(review): the literals in this chain look like they lost their
            // original (presumably non-ASCII) text. As written, contains("") is
            // always true, so the first branch always wins. Restore the literals
            // from the upstream source before relying on this.
            if (regCapital.contains("")) {
                enterVO.setUnit("");
            } else if (regCapital.contains("")) {
                enterVO.setUnit("");
            } else if (regCapital.contains("")) {
                enterVO.setUnit("");
            } else if (regCapital.contains("")) {
                enterVO.setUnit("");
            } else if (regCapital.contains("")) {
                enterVO.setUnit("?");
            } else if (regCapital.contains("?")) {
                enterVO.setUnit("??");
            } else if (regCapital.contains("")) {
                enterVO.setUnit("");
            } else if (regCapital.contains("")) {
                enterVO.setUnit("");
            } else if (regCapital.contains("")) {
                enterVO.setUnit("");
            } else if (regCapital.contains("?")) {
                enterVO.setUnit("?");
            } else if (regCapital.contains("?")) {
                enterVO.setUnit("?");
            } else if (regCapital.contains("")) {
                enterVO.setUnit("");
            } else if (regCapital.contains("?")) {
                enterVO.setUnit("?");
            } else if (regCapital.contains("")) {
                enterVO.setUnit("");
            } else if (regCapital.contains("")) {
                enterVO.setUnit("");
            } else if (regCapital.contains("")) {
                enterVO.setUnit("");
            } else if (regCapital.contains("")) {
                enterVO.setUnit("");
            } else if (regCapital.contains("")) {
                enterVO.setUnit("");
            } else {
                enterVO.setUnit("?");
            }
        }
    }
    // industry is never assigned (a lookup through ExecutorsText.bucket that
    // used to fill it is commented out upstream), so the outer check below only
    // depends on how StringUtils.isNull treats null (presumably true - TODO confirm).
    String industry = null;
    // NOTE(review): industryId is not used by any live code in this method.
    String industryId = null;
    Analyzer anal = new IKAnalyzer();
    String indu = removeBlank(enterVO.getScope());
    IndustryVO ivo = null;
    IndustryVO ivo1 = null;
    if (StringUtils.isNull(industry)) {
        if (!StringUtils.isNull(indu)) {
            // Split the scope text on ; . : , and examine each fragment in turn.
            String[] sourceStrArray = indu.split("[;.:,]");
            for (String str : sourceStrArray) {
                // First try a direct lookup of the whole fragment.
                ivo = getIndustry(str);
                if (!StringUtils.isNull(ivo.getIndustry_id())) {
                    break;
                }
                // No id from the direct lookup: tokenize the fragment and look
                // up the individual tokens instead.
                if (null == ivo.getIndustry_id()) {
                    StringReader reader = new StringReader(str);
                    TokenStream ts = null;
                    ts = anal.tokenStream("", reader);
                    CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
                    // NOTE(review): ts is never ended or closed; only the
                    // underlying StringReader is closed.
                    try {
                        ts.reset();
                        // Tokens are joined with '|' and split again below.
                        String strs = "";
                        while (ts.incrementToken()) {
                            String temp = term.toString();
                            if (!StringUtils.isNull(temp)) {
                                strs += term.toString() + "|";
                            }
                        }
                        reader.close();
                        String[] arrStr1 = strs.split("\\|");
                        // Ordering is delegated to the helper; its criteria are
                        // not visible here.
                        StringUtils.sortStringArray(arrStr1, 1);
                        List<IndustryVO> listiv = new ArrayList<IndustryVO>();
                        List<IndustryVO> listiv_v = new ArrayList<IndustryVO>();
                        List<IndustryVO> listiv_n = new ArrayList<IndustryVO>();
                        // Collect every token whose lookup yields an industry id.
                        for (int i = 0; i < arrStr1.length; i++) {
                            String temp = arrStr1[i];
                            if (!StringUtils.isNull(temp)) {
                                ivo1 = getIndustry(temp);
                                if (!StringUtils.isNull(ivo1.getIndustry_id())) {
                                    listiv.add(ivo1);
                                }
                            }
                        }
                        // Partition the matches by flag: "V" versus everything else.
                        if (listiv.size() > 0) {
                            for (IndustryVO industryVO : listiv) {
                                if ("V".equals(industryVO.getFlag())) {
                                    listiv_v.add(industryVO);
                                } else {
                                    listiv_n.add(industryVO);
                                }
                            }
                        }
                        // Matches flagged "V" take precedence over the others.
                        if (listiv_v.size() > 0) {
                            ivo = getfirstSortStringArray(listiv_v, 1);
                            break;
                        }
                        if (listiv_n.size() > 0) {
                            ivo = getfirstSortStringArray(listiv_n, 1);
                            break;
                        }
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                if (!StringUtils.isNull(ivo.getIndustry_id())) {
                    break;
                }
            }
        }
    }
    // ivo is non-null as soon as one fragment was examined, even when no
    // industry id was found; its name/id are copied regardless.
    if (null != ivo) {
        enterVO.setIndustry(ivo.getIndustry_name());
        enterVO.setIndustryId(ivo.getIndustry_id());
    }
    // Cut the registration number at the first NUL character, if any.
    if (!StringUtils.isNull(enterVO.getRegNum())) {
        if (enterVO.getRegNum().indexOf("\u0000") != -1) {
            enterVO.setRegNum(enterVO.getRegNum().substring(0, enterVO.getRegNum().indexOf("\u0000")));
        }
    }
    // Each nested collection is replaced by the output of its WashEtp helper.
    enterVO.setHolder(WashEtp.clearHolder(enterVO));
    enterVO.setPunishment(WashEtp.clearPunishment(enterVO));
    enterVO.setChange(WashEtp.clearChangeItem(enterVO));
    enterVO.setReport(WashEtp.clearReport(enterVO));
    return enterVO;
}
From source file:cn.com.szgao.enterprise.FileIntoDataBase2p5.java
/** * ik??/* w w w . ja v a 2 s . c om*/ * * @param str * @return * @return IndustryVO * @author liuming * @date 2016629 ?2:36:09 */ public static IndustryVO getIndustryVOFromIk(String str) { // ? Analyzer anal = new IKAnalyzer(); IndustryVO ivo = new IndustryVO(); IndustryVO ivo1 = new IndustryVO(); StringReader reader = new StringReader(str); // ? TokenStream ts = null; ts = anal.tokenStream("", reader); CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); try { ts.reset(); String strs = ""; // ???? while (ts.incrementToken()) { String temp = term.toString(); if (!StringUtils.isNull(temp)) { // getIndustry(temp); strs += term.toString() + "|"; // System.out.print(term.toString() + "|" ); } } reader.close(); String[] arrStr1 = strs.split("\\|"); StringUtils.sortStringArray(arrStr1, 1);// ? List<IndustryVO> listiv = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_v = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_n = new ArrayList<IndustryVO>(); for (int i = 0; i < arrStr1.length; i++) { String temp = arrStr1[i]; if (!StringUtils.isNull(temp)) { ivo1 = getIndustry(temp); if (!StringUtils.isNull(ivo1.getIndustry_id())) { listiv.add(ivo1); // break; } } } // ????? // if (listiv.size() > 0) { // for (IndustryVO industryVO : listiv) { // if ("V".equals(industryVO.getFlag())) { // listiv_v.add(industryVO); // } else { // listiv_n.add(industryVO); // } // } // } // ??? if (listiv.size() > 0) { ivo = getfirstSortStringArray(listiv, 1); } // if (listiv_v.size() > 0) { // ivo = getfirstSortStringArray(listiv_v, 1); // } // if (listiv_n.size() > 0) { // ivo = getfirstSortStringArray(listiv_n, 1); // } } catch (IOException e) { e.printStackTrace(); } return ivo; }
From source file:cn.com.szgao.enterprise.FileIntoDataBase2p5WashJson.java
/** * ik??//w w w .ja v a 2 s .c o m * * @param str * @return * @return IndustryVO * @author liuming * @date 2016629 ?2:36:09 */ public static IndustryVO getIndustryVOFromIk(String str) { // ? Analyzer anal = new IKAnalyzer(); IndustryVO ivo = new IndustryVO(); IndustryVO ivo1 = new IndustryVO(); StringReader reader = new StringReader(str); // ? TokenStream ts = null; ts = anal.tokenStream("", reader); CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); try { ts.reset(); String strs = ""; // ???? while (ts.incrementToken()) { String temp = term.toString(); if (!StringUtils.isNull(temp)) { // getIndustry(temp); strs += term.toString() + "|"; // System.out.print(term.toString() + "|" ); } } reader.close(); String[] arrStr1 = strs.split("\\|"); StringUtils.sortStringArray(arrStr1, 1);// ? List<IndustryVO> listiv = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_v = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_n = new ArrayList<IndustryVO>(); for (int i = 0; i < arrStr1.length; i++) { String temp = arrStr1[i]; if (!StringUtils.isNull(temp)) { ivo1 = getIndustry(temp); if (!StringUtils.isNull(ivo1.getIndustry_id())) { listiv.add(ivo1); // break; } } } // ????? // if (listiv.size() > 0) { // for (IndustryVO industryVO : listiv) { // if ("V".equals(industryVO.getFlag())) { // listiv_v.add(industryVO); // } else { // listiv_n.add(industryVO); // } // } // } // ??? if (listiv.size() > 0) { ivo = getfirstSortStringArray(listiv, 1); } // if (listiv_v.size() > 0) { // ivo = getfirstSortStringArray(listiv_v, 1); // } // if (listiv_n.size() > 0) { // ivo = getfirstSortStringArray(listiv_n, 1); // } } catch (IOException e) { e.printStackTrace(); } return ivo; }
From source file:cn.jcenterhome.web.action.CpAction.java
/**
 * Extracts at most five keywords from {@code text}.
 *
 * <p>The text is tokenized with {@code IKAnalyzer}; tokens with
 * {@code length() > 1} and {@code Common.strlen(word) > 2} are counted. The
 * text is then indexed into an in-memory directory, each counted word is
 * queried against it, and the word's weight is its top score multiplied by
 * its count. Words are ordered by {@code getSortedHashtableByValue} and the
 * first five are returned.
 *
 * <p>Failures during indexing or searching are printed and result in a
 * shorter (possibly empty) list; resources are released without masking the
 * original failure.
 *
 * @param text the text to analyze; may be empty
 * @return up to five keywords, never {@code null}
 * @throws IOException if tokenizing {@code text} fails
 */
private List<String> getKeyWord(String text) throws IOException {
    List<String> keywords = new ArrayList<String>();
    if (Common.empty(text)) {
        return keywords;
    }

    // Count every token that is long enough to be a keyword candidate.
    Map<String, Integer> words = new HashMap<String, Integer>();
    Analyzer analyzer = new IKAnalyzer(true);
    TokenStream tokenStream = analyzer.tokenStream("*", new StringReader(text));
    try {
        TermAttribute termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
        while (tokenStream.incrementToken()) {
            String word = termAtt.term();
            if (word.length() > 1 && Common.strlen(word) > 2) {
                Integer count = words.get(word);
                words.put(word, count == null ? 1 : count + 1);
            }
        }
    } finally {
        tokenStream.close();
    }
    if (words.isEmpty()) {
        return keywords;
    }

    Directory dir = null;
    IndexWriter writer = null;
    IndexSearcher searcher = null;
    try {
        String fieldName = "text";
        dir = new RAMDirectory();
        writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
        Document doc = new Document();
        doc.add(new Field(fieldName, text, Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);
        writer.close();
        // Closed successfully; the finally block must not close it again.
        writer = null;

        searcher = new IndexSearcher(dir);
        searcher.setSimilarity(new IKSimilarity());
        Map<String, Float> temps = new HashMap<String, Float>();
        for (Map.Entry<String, Integer> counted : words.entrySet()) {
            Query query = IKQueryParser.parse(fieldName, counted.getKey());
            TopDocs topDocs = searcher.search(query, 1);
            if (topDocs.totalHits > 0) {
                temps.put(counted.getKey(), topDocs.getMaxScore() * counted.getValue().intValue());
            }
        }
        Entry<String, Float>[] keywordEntry = getSortedHashtableByValue(temps);
        for (Entry<String, Float> entry : keywordEntry) {
            if (keywords.size() >= 5) {
                break;
            }
            keywords.add(entry.getKey());
        }
    } catch (Exception e) {
        // Best effort: keyword extraction must not fail the caller.
        e.printStackTrace();
    } finally {
        // Each resource may still be null if an earlier step failed.
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (dir != null) {
            try {
                dir.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return keywords;
}
From source file:com.antsdb.saltedfish.sql.vdm.LuceneUtil.java
License:Open Source License
static void tokenize(String text, BiConsumer<String, String> lambda) { try (StandardAnalyzer analyzer = new StandardAnalyzer()) { TokenStream stream = analyzer.tokenStream("", text); CharTermAttribute term = stream.getAttribute(CharTermAttribute.class); TypeAttribute type = stream.getAttribute(TypeAttribute.class); stream.reset();//from ww w . ja v a 2s . c o m while (stream.incrementToken()) { lambda.accept(type.type(), term.toString()); } } catch (IOException x) { throw new RuntimeException(x); } }
From source file:com.b2international.index.compat.Highlighting.java
License:Apache License
/** * Splits a string to a list of tokens using the specified Lucene analyzer. * /*w w w. j a va 2 s. co m*/ * @param analyzer the analyzer determining token boundaries (may not be {@code null}) * @param s the string to split * @return a list of tokens, or an empty list if {@code s} is {@code null} or empty */ public static List<String> split(Analyzer analyzer, final String s) { checkNotNull(analyzer, "analyzer"); if (Strings.isNullOrEmpty(s)) { return ImmutableList.of(); } final List<String> tokens = Lists.newArrayList(); TokenStream stream = null; try { stream = analyzer.tokenStream(null, new StringReader(s)); stream.reset(); while (stream.incrementToken()) { tokens.add(stream.getAttribute(CharTermAttribute.class).toString()); } } catch (final IOException ignored) { // Should not be thrown when using a string reader } finally { endAndCloseQuietly(stream); } return tokens; }
From source file:com.basistech.elasticsearch.index.analysis.rosette.SimpleRosetteAnalysisTests.java
License:Open Source License
public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException { stream.reset();//www .j av a 2s .c o m CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class); Assert.assertNotNull(termAttr); int i = 0; while (stream.incrementToken()) { String s = termAttr.toString(); //out.printf("Output Token %2d: %s%n", i, s); Assert.assertTrue(i < expected.length, "got extra term: " + s); Assert.assertEquals(termAttr.toString(), expected[i], "expected different term at index " + i); i++; } Assert.assertEquals(i, expected.length, "not all tokens produced"); }
From source file:com.basistech.IndexFiles.java
License:Open Source License
/**
 * Runs the analyzer over every {@code .txt} file directly inside
 * {@code directory}, consuming each token stream to the end.
 *
 * <p>Each token stream is ended and closed before the next file is opened;
 * the analyzer is reused across files, so an unclosed stream would violate
 * the TokenStream contract on the next {@code tokenStream} call.
 *
 * @param directory the directory to scan (not recursive)
 * @throws IOException if {@code directory} cannot be listed, or a file cannot
 *         be read or tokenized
 */
private void iterateOverFiles(File directory) throws IOException {
    File[] textFiles = directory.listFiles(new FilenameFilter() {
        public boolean accept(File dir, String name) {
            return name.endsWith(".txt");
        }
    });
    // listFiles returns null when the path is not a directory or cannot be read.
    if (textFiles == null) {
        throw new IOException("Cannot list directory: " + directory);
    }
    for (File dataFile : textFiles) {
        Reader dataReader = null;
        try {
            dataReader = Files.newReader(dataFile, Charsets.UTF_8);
            TokenStream tokenStream = analyzer.tokenStream("full_text", dataReader);
            try {
                OffsetAttribute offsets = tokenStream.getAttribute(OffsetAttribute.class);
                tokenStream.reset();
                while (tokenStream.incrementToken()) {
                    // The value is intentionally discarded; the loop only
                    // drives the analysis chain.
                    offsets.startOffset();
                }
                tokenStream.end();
            } finally {
                tokenStream.close();
            }
        } finally {
            IOUtils.closeQuietly(dataReader);
        }
    }
}