List of usage examples for org.apache.lucene.misc.HighFreqTerms.getHighFreqTerms
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field, Comparator<TermStats> comparator) throws Exception
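Before the project-specific examples, here is a minimal, self-contained sketch of the call. It assumes a Lucene 4.x-style API (as used in the examples below); the index path "/path/to/index" and the field name "content" are placeholders to adjust for your own index.

import java.io.File;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.misc.HighFreqTerms;
import org.apache.lucene.misc.TermStats;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class HighFreqTermsExample {
    public static void main(String[] args) throws Exception {
        // Placeholder index location and field name.
        Directory dir = FSDirectory.open(new File("/path/to/index"));
        IndexReader reader = DirectoryReader.open(dir);
        try {
            // Top 25 terms of the "content" field, ranked by document frequency.
            TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, 25, "content",
                    new HighFreqTerms.DocFreqComparator());
            for (TermStats ts : terms) {
                System.out.println(ts.termtext.utf8ToString() + "\tdocFreq=" + ts.docFreq
                        + "\ttotalTermFreq=" + ts.totalTermFreq);
            }
        } finally {
            reader.close();
        }
    }
}

Passing HighFreqTerms.TotalTermFreqComparator instead, or a custom Comparator<TermStats> as the last two examples do, ranks terms by total term frequency rather than document frequency.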
From source file: eu.eexcess.federatedrecommender.decomposer.PseudoRelevanceSourcesDecomposer.java
License: Open Source License
@Override
public SecureUserProfile decompose(SecureUserProfileEvaluation inputSecureUserProfile) {
    FederatedRecommenderCore fCore = null;
    try {
        fCore = FederatedRecommenderCore.getInstance(null);
    } catch (FederatedRecommenderException e) {
        logger.log(Level.SEVERE, "Error getting FederatedRecommenderCore, was perhaps not initialized correctly", e);
    }
    Set<String> keywords = new HashSet<String>();
    for (ContextKeyword cKeyword : inputSecureUserProfile.contextKeywords) {
        keywords.add(cKeyword.text);
    }
    // Temporarily swap in the query-expansion source partners before fetching recommendations.
    List<PartnerBadge> tmpPartnerList = new ArrayList<PartnerBadge>();
    for (PartnerBadge partnerBadge : inputSecureUserProfile.partnerList) {
        tmpPartnerList.add(partnerBadge);
    }
    inputSecureUserProfile.partnerList = inputSecureUserProfile.queryExpansionSourcePartner;
    PartnersFederatedRecommendations pFR = fCore.getPartnersRecommendations(inputSecureUserProfile);
    inputSecureUserProfile.partnerList = tmpPartnerList;

    // Index the titles and descriptions of the partner results in an in-memory index.
    Directory directory = new RAMDirectory();
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_48);
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, analyzer);
    IndexWriter writer = null;
    try {
        writer = new IndexWriter(directory, config);
        for (ResultList resultLists : pFR.getResults().values()) {
            for (Result result : resultLists.results) {
                addDoc(writer, result.description);
                addDoc(writer, result.title);
            }
        }
        writer.close();

        // Extract the 20 most document-frequent terms and add them as expansion keywords.
        IndexReader reader = DirectoryReader.open(directory);
        TermStats[] tStats = null;
        try {
            tStats = HighFreqTerms.getHighFreqTerms(reader, 20, "content", new DocFreqComparator());
        } catch (Exception e) {
            logger.log(Level.SEVERE, "Could not open HighFreqTerms", e);
        } finally {
            reader.close();
        }
        if (tStats != null) {
            for (TermStats termStats : tStats) {
                String utf8String = termStats.termtext.utf8ToString();
                if (utf8String.length() > 4)
                    if (!checkHighFreqTermsQuery(utf8String.toLowerCase(), keywords))
                        if (keywords.add(utf8String.toLowerCase())) {
                            inputSecureUserProfile.contextKeywords.add(new ContextKeyword(utf8String,
                                    termStats.docFreq / 100.0, ExpansionType.EXPANSION));
                        }
            }
        } else {
            logger.log(Level.SEVERE, "TermStats was null!");
        }
    } catch (IOException e) {
        logger.log(Level.SEVERE, "There was an error writing/reading the index", e);
    }
    logger.log(Level.INFO, "Source Expansion: " + keywords.toString() + " Partners: "
            + inputSecureUserProfile.queryExpansionSourcePartner);
    return inputSecureUserProfile;
}
From source file: eu.eexcess.federatedrecommender.evaluation.schloett.SchloettQueryExtraction.java
License: Open Source License
private static List<Interest> getKeyWordsFromHistoryLinks(
        HashMap<String, LinkedHashMap<String, Object>> hashMap, Object taskId) {
    Directory dir = new RAMDirectory();
    Analyzer analyzer = new StandardAnalyzer();
    IndexWriter writer = null;
    if (hashMap != null)
        for (String keyset : hashMap.keySet()) {
            LinkedHashMap<String, Object> linkedHashMap = hashMap.get(keyset);
            // Null check before dereferencing; only handle entries of the requested task.
            if (linkedHashMap != null && linkedHashMap.get("task_id").equals(taskId)) {
                Object urlObject = linkedHashMap.get("url");
                if (urlObject != null && !urlObject.toString().contains("http://de.wikipedia.org/wiki")) {
                    URL url = null;
                    IndexReader reader = null;
                    try {
                        reader = DirectoryReader.open(dir);
                    } catch (IOException e4) {
                        // Index does not exist yet; nothing has been indexed so far.
                    }
                    IndexSearcher searcher = null;
                    if (reader != null)
                        searcher = new IndexSearcher(reader);
                    TopDocs docs = null;
                    if (searcher != null) {
                        try {
                            docs = searcher.search(new TermQuery(new Term("url", urlObject.toString())), 1);
                        } catch (IOException e4) {
                            e4.printStackTrace();
                        }
                    }
                    // Only fetch and index URLs that are not already in the index.
                    if (docs == null || docs.totalHits == 0) {
                        try {
                            url = new URL(urlObject.toString());
                        } catch (MalformedURLException e3) {
                            e3.printStackTrace();
                        }
                        try {
                            reader.close();
                        } catch (Exception e3) {
                        }
                        InputStream input = null;
                        if (url != null) {
                            try {
                                input = url.openStream();
                            } catch (IOException e2) {
                            }
                            if (input != null) {
                                // Parse the page with Tika and keep only the body text.
                                LinkContentHandler linkHandler = new LinkContentHandler();
                                BodyContentHandler textHandler = new BodyContentHandler(10 * 1024 * 1024);
                                ToHTMLContentHandler toHTMLHandler = new ToHTMLContentHandler();
                                TeeContentHandler teeHandler = new TeeContentHandler(linkHandler, textHandler,
                                        toHTMLHandler);
                                Metadata metadata = new Metadata();
                                ParseContext parseContext = new ParseContext();
                                HtmlParser parser = new HtmlParser();
                                try {
                                    parser.parse(input, teeHandler, metadata, parseContext);
                                } catch (IOException | SAXException | TikaException e1) {
                                    System.out.println(urlObject.toString());
                                    e1.printStackTrace();
                                }
                                // POS-tag the text and keep only nouns (NN/NNS) that are not blacklisted.
                                String string = textHandler.toString();
                                String docString = " ";
                                String tagged = tagger.tagString(string.toLowerCase());
                                Pattern pattern = Pattern.compile("\\s\\w+(_NN|_NNS)");
                                Matcher matcher = pattern.matcher(tagged);
                                while (matcher.find()) {
                                    if (!blackList.contains(matcher.group().replaceAll("_NN|_NNS", "")))
                                        docString += matcher.group().replaceAll("_NN|_NNS", " ") + " ";
                                }
                                // Index the extracted nouns together with the source URL.
                                Document doc = new Document();
                                doc.add(new TextField("content", docString, Store.YES));
                                doc.add(new StringField("url", urlObject.toString(), Store.YES));
                                try {
                                    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_0,
                                            analyzer);
                                    writer = new IndexWriter(dir, config);
                                    writer.addDocument(doc);
                                    writer.close();
                                    input.close();
                                } catch (IOException e) {
                                    e.printStackTrace();
                                }
                            }
                        }
                    }
                }
            }
        }
    // Extract the 30 most document-frequent "content" terms from the freshly built index.
    IndexReader reader = null;
    try {
        reader = DirectoryReader.open(dir);
    } catch (Exception e1) {
        System.out.println(e1);
    }
    TermStats[] tStats = null;
    if (reader != null) {
        try {
            tStats = HighFreqTerms.getHighFreqTerms(reader, 30, "content", new DocFreqComparator());
        } catch (Exception e) {
            System.out.println(e);
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    List<Interest> keywordList = new ArrayList<Interest>();
    System.out.println("Extraction: ");
    if (tStats != null) {
        for (TermStats termStats : tStats) {
            String utf8String = termStats.termtext.utf8ToString();
            if (!blackList.contains(utf8String.toLowerCase())) {
                System.out.print("\"" + utf8String.toLowerCase() + "\",");
                keywordList.add(new Interest(utf8String.toLowerCase()));
            }
        }
    }
    System.out.println();
    return keywordList;
}
From source file: eu.eexcess.sourceselection.redde.indexer.topterm.IndexHelper.java
License: Apache License
/**
 * Returns the top (to - startFrom + 1) terms, beginning with startFrom.
 *
 * @param startFrom
 *            first term
 * @param to
 *            last term
 * @return the extracted term names
 * @throws Exception
 */
protected String[] getTopTerms(int startFrom, int to) throws Exception {
    int numTerms = to - startFrom + 1;
    String[] termNames = new String[numTerms];

    TermStats[] terms = HighFreqTerms.getHighFreqTerms(inIndexReader, to + 1, fieldOfInterest,
            new HighFreqTerms.DocFreqComparator());

    int idx = 0;
    for (TermStats term : terms) {
        // Parse the term text out of TermStats#toString().
        String termDetails = term.toString();
        int startIndex = termDetails.lastIndexOf(termVariableName) + termVariableName.length();
        int endIndex = termDetails.indexOf(" ", startIndex);
        termNames[idx++] = termDetails.substring(startIndex, endIndex);
        if (idx >= numTerms) {
            break;
        }
    }
    return termNames;
}
From source file: luceneindexcreator.LuceneIndexCreator.java
public static void main(String[] args) {
    try {
        // Compare terms by their total term frequency.
        Comparator<TermStats> comparator = new Comparator<TermStats>() {
            @Override
            public int compare(TermStats t1, TermStats t2) {
                return t1.totalTermFreq < t2.totalTermFreq ? -1 : 1;
            }
        };

        LuceneIndexCreator lw = new LuceneIndexCreator(INDEX_PATH, JSON_FILE_PATH_WEEKLY);
        lw.createIndex();

        // Check that the index has been created successfully.
        Directory indexDirectory = FSDirectory.open(new File(INDEX_PATH));
        IndexReader indexReader = DirectoryReader.open(indexDirectory);
        int numDocs = indexReader.numDocs();

        /*
         * Keywords sorted by date (disabled): the same procedure as the weekly
         * variant below, but building one sub-index per "Date" value and writing
         * JSONResults/<date>.json with the top 50 "content" terms of each date.
         */

        // Keywords sorted by week: build one sub-index per week and the
        // associated JSON files of keyword frequencies.
        ArrayList<String> indexedWeeks = new ArrayList<String>();
        for (int i = 0; i < numDocs; i++) {
            Document document = indexReader.document(i);
            String weekNum = document.get("Week_number");
            if (!contains(indexedWeeks, weekNum)) {
                LuceneIndexCreator lww = new LuceneIndexCreator(PARENT_INDEX_PATH + "week" + weekNum,
                        JSON_FILE_PATH_WEEKLY);
                lww.createSubindexWeek(weekNum);
                indexedWeeks.add(weekNum);
            }
        }

        JSONArray json1 = new JSONArray();
        for (String weekNum : indexedWeeks) {
            Directory indexDirectoryWeek = FSDirectory.open(new File(PARENT_INDEX_PATH + "week" + weekNum));
            IndexReader indexReaderWeek = DirectoryReader.open(indexDirectoryWeek);
            // Top 100 terms of the "content" field for this week.
            TermStats[] hTermResult = HighFreqTerms.getHighFreqTerms(indexReaderWeek, 100, "content",
                    comparator);
            JSONObject json2 = new JSONObject();
            json2.put("Week", weekNum);
            JSONArray json3 = new JSONArray();
            for (int j = 0; j < hTermResult.length; j++) {
                JSONObject json4 = new JSONObject();
                json4.put("Term", hTermResult[j].termtext.utf8ToString());
                json4.put("Frequency", hTermResult[j].totalTermFreq);
                json3.add(json4);
            }
            json2.put("Terms", json3);
            json1.add(json2);
        }

        // Output the weekly term frequencies as JSON.
        try (FileWriter file = new FileWriter("JSONResults/allWeeklyTerms.json")) {
            file.write(json1.toJSONString());
            System.out.println("Successfully Copied JSON Object to File...");
            System.out.println("\nJSON Object: " + json1);
        }

        // Term frequencies over all documents.
        JSONArray termResultJSONArray = new JSONArray();
        TermStats[] hTermResult = HighFreqTerms.getHighFreqTerms(indexReader, 150, "content", comparator);
        for (int i = 0; i < hTermResult.length; i++) {
            JSONObject termResultJSON = new JSONObject();
            termResultJSON.put("Term", hTermResult[i].termtext.utf8ToString());
            termResultJSON.put("Frequency", hTermResult[i].totalTermFreq);
            termResultJSONArray.add(termResultJSON);
        }

        try (FileWriter file = new FileWriter("JSONResults/allTermFreq.json")) {
            file.write(termResultJSONArray.toJSONString());
            System.out.println("Successfully Copied JSON Object to File...");
            System.out.println("\nJSON Object: " + termResultJSONArray);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file: servlets.TermStatsComparator.java
/**
 * Processes requests for both HTTP <code>GET</code> and <code>POST</code> methods.
 *
 * @param request servlet request
 * @param response servlet response
 * @throws ServletException if a servlet-specific error occurs
 * @throws IOException if an I/O error occurs
 */
protected void processRequest(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    response.setContentType("text/html;charset=UTF-8");
    try (PrintWriter out = response.getWriter()) {
        IndexReader reader = retriever.getReader();
        String term = request.getParameter("term");
        if (isNumber(term)) {
            // A numeric parameter is interpreted as "show the top n terms" as an HTML table.
            TermStats[] termStats = null;
            try {
                termStats = HighFreqTerms.getHighFreqTerms(reader, Integer.parseInt(term),
                        TrecDocRetriever.FIELD_ANALYZED_CONTENT, new TermStatsComparator());
            } catch (Exception ex) {
                out.println("Error in obtaining term stats");
            }
            if (termStats == null) {
                out.println("Error in obtaining term stats");
                return;
            }
            StringBuffer responseBuff = new StringBuffer("<table><tbody>");
            responseBuff.append("<tr>").append("<th>").append("Term").append("</th>").append("<th>")
                    .append("Doc Freq").append("</th>").append("<th>").append("Coll Freq").append("</th>")
                    .append("</tr>");
            for (TermStats ts : termStats) {
                responseBuff.append("<tr>").append("<td>").append(ts.termtext.utf8ToString()).append("</td>")
                        .append("<td>").append(ts.docFreq).append("</td>").append("<td>")
                        .append(ts.totalTermFreq).append("</td>").append("</tr>");
            }
            responseBuff.append("</tbody></table>");
            out.println(responseBuff.toString());
        } else {
            // Otherwise report document and collection frequency for the given term.
            String analyzedTerm = analyze(term);
            Term t = new Term(TrecDocRetriever.FIELD_ANALYZED_CONTENT, analyzedTerm);
            int docFreq = reader.docFreq(t);
            long collFreq = reader.totalTermFreq(t);
            out.println("Doc freq: " + docFreq + " " + "Coll Freq: " + collFreq);
        }
    }
}