List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value; the method returns the instance of that attribute contained in the stream's AttributeSource.
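Before the per-project examples below, here is a minimal, self-contained sketch of the usual getAttribute consume loop. It is not taken from any of the listed files; it assumes a Lucene release in which StandardAnalyzer has a no-argument constructor, and the field name "body" and the sample text are placeholders.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream ts = analyzer.tokenStream("body",
                new StringReader("getAttribute returns the shared attribute instance"));
        try {
            // getAttribute throws IllegalArgumentException if the stream does not expose
            // this attribute; guard with ts.hasAttribute(...) when the chain is not under your control.
            CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
            ts.reset();                          // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString()); // the same instance is refilled for each token
            }
            ts.end();                            // records end-of-stream state (final offset)
        } finally {
            ts.close();
        }
        analyzer.close();
    }
}

Some of the older examples below use TermAttribute, which was deprecated and later removed in favor of CharTermAttribute; the loop structure is otherwise the same.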
From source file:org.usergrid.utils.IndexUtils.java
License:Apache License
public static String keywordText(String source) {
    TokenStream ts = analyzer.tokenStream("keywords", new StringReader(source));
    StringBuilder builder = new StringBuilder();
    boolean first = true;
    try {
        while (ts.incrementToken()) {
            if (!first) {
                builder.append(' ');
            }
            first = false;
            builder.append(ts.getAttribute(TermAttribute.class).term());
        }
    } catch (IOException e) {
        logger.error("Error getting keywords ", e);
    }
    return builder.toString();
}
From source file:perf.TestAnalyzerPerf.java
License:Apache License
private static void testAnalyzer(String desc, File wikiLinesFile, Analyzer a, int warmupCount, int runCount)
        throws Exception {
    System.out.println("\nTEST: " + desc);

    // 64 KB buffer
    InputStream is = new FileInputStream(wikiLinesFile);
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 1 << 16);

    long startTime = System.currentTimeMillis();
    long sumTime = 0;
    long hash = 0;
    long tokenCount = 0;
    int totCount = warmupCount + runCount;
    for (int i = 0; i < totCount; i++) {
        boolean isWarmup = i < warmupCount;
        if (i % 10000 == 0) {
            System.out.println(String.format(Locale.ROOT, "%.1f sec: %d...",
                    (System.currentTimeMillis() - startTime) / 1000.0, i));
        }
        String s = reader.readLine();
        long t0 = System.nanoTime();
        TokenStream ts = a.tokenStream("field", new StringReader(s));
        ts.reset();

        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt;
        if (ts.hasAttribute(PositionIncrementAttribute.class)) {
            posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
        } else {
            posIncAtt = null;
        }
        OffsetAttribute offsetAtt;
        if (ts.hasAttribute(OffsetAttribute.class)) {
            offsetAtt = ts.getAttribute(OffsetAttribute.class);
        } else {
            offsetAtt = null;
        }

        while (ts.incrementToken()) {
            hash += 31 * ArrayUtil.hashCode(termAtt.buffer(), 0, termAtt.length());
            if (posIncAtt != null) {
                hash += 31 * posIncAtt.getPositionIncrement();
            }
            if (offsetAtt != null) {
                hash += 31 * offsetAtt.startOffset();
                hash += 31 * offsetAtt.endOffset();
            }
            if (isWarmup == false) {
                tokenCount++;
            }
        }
        ts.end();
        ts.close();
        if (isWarmup == false) {
            sumTime += System.nanoTime() - t0;
        }
    }
    reader.close();
    System.out.println(String.format(Locale.ROOT, "%s time=%.2f msec hash=%d tokens=%d", desc,
            (sumTime / 1000000.0), hash, tokenCount));
}
From source file:pt.unlfctdi.cryptosearch.core.client.ClientConnectorBeanWIKI.java
License:Apache License
@Override
public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);
        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<CipheredPostingList> cipheredPostings = search.processQuery(cipheredWords);
        for (CipheredPostingList cipherPostings : cipheredPostings) {
            PostingList tfs = crypto.decryptPostingList(cipherPostings.getCipheredPostings());
            PriorityQueue<Posting> postings = new PriorityQueue<Posting>(tfs.getPostings().size());
            for (TermFreq tf : tfs.getPostings())
                postings.add(new Posting(tf.getDocId(), tf.getFreq()));
            //postings.add(new Posting(tf.getDocId(), Utils.bm25(tf.getFreq(), tfs.getDf(),
            //        docsDict.size(), docLengths.get(tf.getDocId()), sumDocLengths)));
            Posting posting;
            while ((posting = postings.poll()) != null) {
                //if (!removedDocs.containsKey(posting.getDocId())) {
                int j = finalScores.indexOf(posting);
                if (j == -1)
                    finalScores.add(posting);
                else
                    finalScores.get(j).setScore(finalScores.get(j).getScore() + posting.getScore());
            }
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}
From source file:pt.unlfctdi.cryptosearch.core.client.ClientConnectorBeanWIKI.java
License:Apache License
@Override
public void addFirstDocuments(String xmlFile) {
    WikiXMLParser wxsp = WikiXMLParserFactory.getSAXParser(xmlFile);
    try {
        wxsp.setPageCallback(new PageCallbackHandler() {
            public void process(WikiPage page) {
                if (page.isDisambiguationPage() || page.isRedirect() || page.isSpecialPage())
                    return;
                List<WordKey> cipheredWords = new ArrayList<WordKey>();
                try {
                    TokenStream ts = analyzer.tokenStream(null,
                            new BufferedReader(new StringReader(page.getText())));
                    try {
                        ts.reset();
                        while (ts.incrementToken()) {
                            String word = ts.getAttribute(CharTermAttribute.class).toString();
                            if (word.length() > 0)
                                cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
                        }
                        ts.end();
                    } finally {
                        ts.close();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
                search.addFirstDocuments(new CDocument(new WordKey(crypto.digest(page.getTitle().getBytes())),
                        cipheredWords.toArray(new WordKey[cipheredWords.size()])));
                //store doc in the cloud
                // cloud.putDoc("" + i, crypto.encryptDocAES(documents[i]));
            }
        });
        wxsp.parse();
        search.buildIndex();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:pt.unlfctdi.cryptosearch.core.client.ClientConnectorBeanWIKIHom.java
License:Apache License
@Override
public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);
        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<HomPosting> homScores = search.processQuery(cipheredWords);
        for (HomPosting posting : homScores) {
            finalScores.add(new Posting(posting.getDocId(), crypto.decryptHom(posting.getScore()).intValue()));
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}
From source file:pt.unlfctdi.cryptosearch.core.client.PrototypeClientConnector.java
License:Apache License
public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);
        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<CipheredPostingList> cipheredPostings = search.processQuery(cipheredWords);
        for (CipheredPostingList cipherPostings : cipheredPostings) {
            PostingList tfs = crypto.decryptPostingList(cipherPostings.getCipheredPostings());
            PriorityQueue<Posting> postings = new PriorityQueue<Posting>(tfs.getPostings().size());
            for (TermFreq tf : tfs.getPostings())
                postings.add(new Posting(tf.getDocId(), tf.getFreq()));
            //postings.add(new Posting(tf.getDocId(), Utils.bm25(tf.getFreq(), tfs.getDf(),
            //        docsDict.size(), docLengths.get(tf.getDocId()), sumDocLengths)));
            Posting posting;
            while ((posting = postings.poll()) != null) {
                //if (!removedDocs.containsKey(posting.getDocId())) {
                int j = finalScores.indexOf(posting);
                if (j == -1)
                    finalScores.add(posting);
                else
                    finalScores.get(j).setScore(finalScores.get(j).getScore() + posting.getScore());
            }
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}
From source file:pt.unlfctdi.cryptosearch.core.client.PrototypeClientConnector.java
License:Apache License
public void addFirstDocuments(File[] docs) {
    try {
        // File f = new File(path);
        // File[] docs = f.listFiles();
        for (int i = 0; i < docs.length; i++) {
            String content = Utils.readFileAsString(docs[i]);
            List<WordKey> cipheredWords = new ArrayList<WordKey>();
            TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(content)));
            try {
                ts.reset();
                while (ts.incrementToken()) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    if (word.length() > 0)
                        cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
                }
                ts.end();
            } finally {
                ts.close();
            }
            search.addFirstDocuments(crypto.encryptAES(docs[i].getName().getBytes()), cipheredWords);
            storage.putDoc("" + i,
                    crypto.encryptAES(Utils.serializeObject(new PDocument(docs[i].getName(), content))));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:relevantfile.XmlParser.java
License:Open Source License
/********************************************************************************************/
public static String removeStopWordsAndStem(String input) throws IOException {
    /*String[] stop_word={"abstract","assert","boolean","break","byte","case","catch","char","class","const","continue"
            ,"default","do","double","else","enum","extends","final","finally","float","for","goto","if","implements","import","instanceof","int"
            ,"interface","long","native","new","package","private","protected","public","return","short","static","strictfp","super",
            "switch","synchronized","this","throw","throws","transient","try","void","volatile","while","false","null","true"};*/
    String[] stop_word = { "auto", "break", "case", "char", "const", "continue", "default", "do", "double",
            "else", "enum", "extern", "float", "for", "goto", "if", "int", "long", "register", "return", "short",
            "signed", "sizeof", "static", "struct", "switch", "typedef", "union", "unsigned", "void", "volatile",
            "while", "abstract", "as", "base", "bool", "byte", "catch", "checked", "class", "decimal", "delegate",
            "event", "explicit", "false", "finally", "fixed", "foreach", "implicit", "in", "interface", "internal",
            "is", "lock", "namespace", "new", "null", "object", "operator", "out", "override", "params", "private",
            "protected", "public", "readonly", "ref", "sbyte", "sealed", "stackalloc", "string", "this", "throw",
            "true", "try", "typeof", "uint", "ulong", "unchecked", "unsafe", "ushort", "using", "virtual" };
    ArrayList<String> stopWords = new ArrayList<String>();
    for (int k = 0; k < stop_word.length; k++)
        stopWords.add(stop_word[k]);

    TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_46, new StringReader(input));
    tokenStream = new StopFilter(Version.LUCENE_46, tokenStream, StandardAnalyzer.STOP_WORDS_SET);
    tokenStream = new StopFilter(Version.LUCENE_46, tokenStream,
            StopFilter.makeStopSet(Version.LUCENE_46, stopWords));
    tokenStream = new PorterStemFilter(tokenStream);

    StringBuilder sb = new StringBuilder();
    CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        if (sb.length() > 0) {
            sb.append(" ");
        }
        sb.append(token.toString());
    }
    tokenStream.end();
    tokenStream.close();
    return sb.toString();
}
From source file:reviews.searching.SearchReviews.java
License:Apache License
public static void displayTokenStream(TokenStream tokenStream) throws IOException {
    TermAttribute termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
    TypeAttribute typeAtt = (TypeAttribute) tokenStream.getAttribute(TypeAttribute.class);
    while (tokenStream.incrementToken()) {
        System.out.println(termAtt.term());
        System.out.println("Type: " + typeAtt.type());
        System.out.println();
    }
}
From source file:searching.QueryExpansion.java
/**
 * Store frequencies of top docs in maps.
 *
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException
 */
public void addExpansionDoc(String text, double doc_score, Analyzer analyzer, IndexReader reader)
        throws IOException {
    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {
        TreeMap<String, Double> map = new TreeMap();
        Integer length = 0;
        Double f;
        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));
        //OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {
                    f = map.get(term);
                    if (f == null) {
                        map.put(term, 1.0);
                    } else {
                        map.put(term, f + 1.0);
                    }
                    length++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }
        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_scores[actual_pdocs] = doc_score;
        //logger.info(observed_bg_mass[iter] + "\t" + (1-observed_bg_prob));
        actual_pdocs++;
    }
}