List of usage examples for org.apache.lucene.analysis TokenStream reset
public void reset() throws IOException
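The usual consumption pattern is: obtain a TokenStream from an Analyzer, add the attributes you want to read, call reset() before the first incrementToken(), iterate, then call end() and close(). Before the per-project examples, here is a minimal, self-contained sketch of that pattern (it assumes Lucene 5.x or later, where StandardAnalyzer has a no-argument constructor; the field name "body" and the sample text are purely illustrative):

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamResetExample {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        // tokenStream() returns a reusable stream; it must be reset() before consumption
        try (TokenStream ts = analyzer.tokenStream("body", new StringReader("Some text to tokenize"))) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                       // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();                         // finish consuming the stream
        }                                     // try-with-resources closes the stream
        analyzer.close();
    }
}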
From source file:pt.unlfctdi.cryptosearch.core.client.ClientConnectorBeanWIKIHom.java
License:Apache License
@Override
public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);
        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<HomPosting> homScores = search.processQuery(cipheredWords);
        for (HomPosting posting : homScores) {
            finalScores.add(new Posting(posting.getDocId(), crypto.decryptHom(posting.getScore()).intValue()));
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}
From source file:pt.unlfctdi.cryptosearch.core.client.PrototypeClientConnector.java
License:Apache License
public List<Posting> query(String query) {
    try {
        List<Posting> finalScores = new ArrayList<Posting>(12);
        List<WordKey> cipheredWords = new LinkedList<WordKey>();
        TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(query)));
        try {
            ts.reset();
            while (ts.incrementToken()) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                if (word.length() > 0)
                    cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
            }
            ts.end();
        } finally {
            ts.close();
        }
        List<CipheredPostingList> cipheredPostings = search.processQuery(cipheredWords);
        for (CipheredPostingList cipherPostings : cipheredPostings) {
            PostingList tfs = crypto.decryptPostingList(cipherPostings.getCipheredPostings());
            PriorityQueue<Posting> postings = new PriorityQueue<Posting>(tfs.getPostings().size());
            for (TermFreq tf : tfs.getPostings())
                postings.add(new Posting(tf.getDocId(), tf.getFreq()));
                //postings.add(new Posting(tf.getDocId(), Utils.bm25(tf.getFreq(), tfs.getDf(),
                //        docsDict.size(), docLengths.get(tf.getDocId()), sumDocLengths)));
            Posting posting;
            while ((posting = postings.poll()) != null) {
                //if (!removedDocs.containsKey(posting.getDocId())) {
                int j = finalScores.indexOf(posting);
                if (j == -1)
                    finalScores.add(posting);
                else
                    finalScores.get(j).setScore(finalScores.get(j).getScore() + posting.getScore());
            }
        }
        Collections.sort(finalScores);
        if (finalScores.size() > 12)
            return finalScores.subList(0, 12);
        else
            return finalScores;
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}
From source file:pt.unlfctdi.cryptosearch.core.client.PrototypeClientConnector.java
License:Apache License
public void addFirstDocuments(File[] docs) {
    try {
        // File f = new File(path);
        // File[] docs = f.listFiles();
        for (int i = 0; i < docs.length; i++) {
            String content = Utils.readFileAsString(docs[i]);
            List<WordKey> cipheredWords = new ArrayList<WordKey>();
            TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(content)));
            try {
                ts.reset();
                while (ts.incrementToken()) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    if (word.length() > 0)
                        cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
                }
                ts.end();
            } finally {
                ts.close();
            }
            search.addFirstDocuments(crypto.encryptAES(docs[i].getName().getBytes()), cipheredWords);
            storage.putDoc("" + i,
                    crypto.encryptAES(Utils.serializeObject(new PDocument(docs[i].getName(), content))));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:relevantfile.XmlParser.java
License:Open Source License
/********************************************************************************************/
public static String removeStopWordsAndStem(String input) throws IOException {
    /*String[] stop_word = { "abstract", "assert", "boolean", "break", "byte", "case", "catch", "char", "class",
            "const", "continue", "default", "do", "double", "else", "enum", "extends", "final", "finally",
            "float", "for", "goto", "if", "implements", "import", "instanceof", "int", "interface", "long",
            "native", "new", "package", "private", "protected", "public", "return", "short", "static",
            "strictfp", "super", "switch", "synchronized", "this", "throw", "throws", "transient", "try",
            "void", "volatile", "while", "false", "null", "true" };*/
    String[] stop_word = { "auto", "break", "case", "char", "const", "continue", "default", "do", "double",
            "else", "enum", "extern", "float", "for", "goto", "if", "int", "long", "register", "return",
            "short", "signed", "sizeof", "static", "struct", "switch", "typedef", "union", "unsigned", "void",
            "volatile", "while", "abstract", "as", "base", "bool", "byte", "catch", "checked", "class",
            "decimal", "delegate", "event", "explicit", "false", "finally", "fixed", "foreach", "implicit",
            "in", "interface", "internal", "is", "lock", "namespace", "new", "null", "object", "operator",
            "out", "override", "params", "private", "protected", "public", "readonly", "ref", "sbyte",
            "sealed", "stackalloc", "string", "this", "throw", "true", "try", "typeof", "uint", "ulong",
            "unchecked", "unsafe", "ushort", "using", "virtual" };
    ArrayList<String> stopWords = new ArrayList<String>();
    for (int k = 0; k < stop_word.length; k++)
        stopWords.add(stop_word[k]);

    TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_46, new StringReader(input));
    tokenStream = new StopFilter(Version.LUCENE_46, tokenStream, StandardAnalyzer.STOP_WORDS_SET);
    tokenStream = new StopFilter(Version.LUCENE_46, tokenStream,
            StopFilter.makeStopSet(Version.LUCENE_46, stopWords));
    tokenStream = new PorterStemFilter(tokenStream);

    StringBuilder sb = new StringBuilder();
    CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        if (sb.length() > 0) {
            sb.append(" ");
        }
        sb.append(token.toString());
    }
    tokenStream.end();
    tokenStream.close();
    return sb.toString();
}
From source file:retriever.TermFreq.java
String analyze(String query) throws Exception {
    StringBuffer buff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(query));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        buff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return buff.toString();
}
From source file:ri.trabri.Lucene.java
protected ArrayList<String> geraTokens(String text) throws IOException {
    TokenStream stream = this.analyzer.tokenStream(null, new StringReader(text));
    ArrayList<String> words = new ArrayList<>();
    CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        //System.out.println(cattr.toString());
        words.add(cattr.toString());
    }
    stream.end();
    stream.close();
    return words;
}
From source file:se.inera.intyg.webcert.web.service.diagnos.repo.DiagnosRepositoryImpl.java
License:Open Source License
@Override
public List<Diagnos> searchDiagnosisByDescription(String searchString, int nbrOfResults) {
    if (Strings.isNullOrEmpty(searchString)) {
        return Collections.emptyList();
    }
    BooleanQuery query = new BooleanQuery();
    try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
        TokenStream tokenStream = analyzer.tokenStream(DESC, searchString);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = WildcardQuery.WILDCARD_STRING + charTermAttribute.toString()
                    + WildcardQuery.WILDCARD_STRING;
            query.add(new WildcardQuery(new Term(DESC, term)), BooleanClause.Occur.MUST);
        }
    } catch (IOException e) {
        throw new RuntimeException("IOException occurred in lucene index search", e);
    }
    return searchDiagnosisByQuery(query, nbrOfResults);
}
From source file:searching.QueryExpansion.java
/**
 * Store frequencies of top docs in maps.
 *
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException
 */
public void addExpansionDoc(String text, double doc_score, Analyzer analyzer, IndexReader reader)
        throws IOException {
    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {
        TreeMap<String, Double> map = new TreeMap();
        Integer length = 0;
        Double f;
        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));
        //OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {
                    f = map.get(term);
                    if (f == null) {
                        map.put(term, 1.0);
                    } else {
                        map.put(term, f + 1.0);
                    }
                    length++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }
        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_scores[actual_pdocs] = doc_score;
        //logger.info(observed_bg_mass[iter] + "\t" + (1 - observed_bg_prob));
        actual_pdocs++;
    }
}
From source file:searching.QueryExpansion.java
/**
 * Calculate positional relevance weights.
 *
 * @param query
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException
 */
public void addPositionalExpansionDoc(CustomQuery query, String text, double doc_score, Analyzer analyzer,
        IndexReader reader) throws IOException {
    //System.out.println(query);
    //System.out.println(text);
    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {
        TreeMap<String, ArrayList<Long>> query_term_pos = new TreeMap();
        Integer length = 0;
        Long pos = 1L;
        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));
        ArrayList<Long> qpos;
        //OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {
                    //System.out.print(pos + ":" + term + " ");
                    if (query.contains(term)) {
                        qpos = query_term_pos.get(term);
                        if (qpos == null) {
                            qpos = new ArrayList<>();
                        }
                        qpos.add(pos);
                        query_term_pos.put(term, qpos);
                    }
                    length++;
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        //
        // All positions collected;
        // now iterate over the document again to get weights
        //
        //System.out.println("Doc length" + text.length());
        //System.out.println("Positions... ");
        //System.out.println(query_term_pos.toString());
        //System.out.println("END...");

        TreeMap<String, Double> map = new TreeMap();
        Double f;
        pos = 1L;
        double w, w_norm, prob, f0;
        Double pos_length = 0.0;
        Double sum_df = (double) reader.getSumDocFreq("text");
        double spud_pi = SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega
                / (query_term_pos.size() * (1 - SPUDLMSimilarity.omega)
                        + SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega);
        Double df;
        double dist;

        ts = analyzer.tokenStream("myfield", new StringReader(text));
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {
                    prob = 0.0;
                    //f is occurrence
                    w_norm = Math.sqrt(2 * Math.PI * prm_sigma * prm_sigma);
                    for (String qt : query_term_pos.keySet()) {
                        ArrayList<Long> pos_list = query_term_pos.get(qt);
                        w = 1.0;
                        df = (double) reader.docFreq(new Term("text", qt));
                        for (Long p : pos_list) {
                            dist = ((pos - p) * (pos - p)) / (2 * prm_sigma * prm_sigma);
                            f0 = Math.exp(-dist);
                            //if (QueryExpansion.method == QueryExpansion.PRM2QTM) {
                            //    w += (((double) ((1 - spud_pi) * f0) / (((1 - spud_pi) * f0) + spud_pi * (df / sum_df))));
                            //    w += f0;
                            //} else {
                            w += f0;
                            //}
                        }
                        //System.out.println("weight " + w);
                        prob += Math.log(w / w_norm);
                    }
                    //System.out.print(pos + "\t" + term + "\t" + Math.exp(prob) + "\n");

                    /** sum of the probabilities over the positional terms in the documents */
                    f = map.get(term);
                    if (f == null) {
                        map.put(term, Math.exp(prob));
                    } else {
                        map.put(term, f + Math.exp(prob));
                    }
                    pos_length += Math.exp(prob);
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        double sum = 0.0;
        for (String word : map.keySet()) {
            //logger.info(word + "\t" + map.get(word) / pos_length);
            sum += map.get(word) / pos_length;
        }
        //logger.info("sum is " + sum);

        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_positional_lengths[actual_pdocs] = pos_length;
        pdoc_scores[actual_pdocs] = doc_score;
        actual_pdocs++;
    }
}
From source file:servlets.TermStatsComparator.java
String analyze(String query) {
    StringBuffer buff = new StringBuffer();
    try {
        Analyzer analyzer = retriever.getAnalyzer();
        TokenStream stream = analyzer.tokenStream("dummy", new StringReader(query));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            buff.append(term);
            break;
        }
        stream.end();
        stream.close();
    } catch (Exception ex) {
        ex.printStackTrace();
        return query;
    }
    return buff.toString();
}