Example usage for org.apache.lucene.misc HighFreqTerms getHighFreqTerms

Introduction

On this page you can find example usage for org.apache.lucene.misc HighFreqTerms getHighFreqTerms.

Prototype

public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field,
        Comparator<TermStats> comparator) throws Exception 

Document

Returns TermStats[] ordered by the specified comparator.
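
Before the project examples below, here is a minimal, self-contained sketch of the call. It assumes Lucene 4.x on the classpath (the 4.8 constants used by the first example); the class name, the field name "content", and the sample documents are arbitrary choices for illustration.

import java.util.Arrays;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.misc.HighFreqTerms;
import org.apache.lucene.misc.TermStats;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class HighFreqTermsExample {
    public static void main(String[] args) throws Exception {
        // Build a tiny in-memory index with a single analyzed field.
        Directory directory = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48,
                new StandardAnalyzer(Version.LUCENE_48));
        try (IndexWriter writer = new IndexWriter(directory, config)) {
            for (String text : Arrays.asList("lucene index term", "lucene term stats")) {
                Document doc = new Document();
                doc.add(new TextField("content", text, Store.NO));
                writer.addDocument(doc);
            }
        }

        // Ask for the top 10 terms of "content", ordered by document frequency.
        try (IndexReader reader = DirectoryReader.open(directory)) {
            TermStats[] stats = HighFreqTerms.getHighFreqTerms(reader, 10, "content",
                    new HighFreqTerms.DocFreqComparator());
            for (TermStats ts : stats) {
                System.out.println(ts.termtext.utf8ToString()
                        + " docFreq=" + ts.docFreq + " totalTermFreq=" + ts.totalTermFreq);
            }
        }
    }
}

Because getHighFreqTerms enumerates all terms of the given field to find the top entries, it is best suited to offline analysis and diagnostics rather than per-request use.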

Usage

From source file: eu.eexcess.federatedrecommender.decomposer.PseudoRelevanceSourcesDecomposer.java

License: Open Source License

@Override
public SecureUserProfile decompose(SecureUserProfileEvaluation inputSecureUserProfile) {
    FederatedRecommenderCore fCore = null;

    try {
        fCore = FederatedRecommenderCore.getInstance(null);
    } catch (FederatedRecommenderException e) {
        logger.log(Level.SEVERE, "Error getting FederatedRecommenderCore,was perhabs not initialized correctly",
                e);
    }
    Set<String> keywords = new HashSet<String>();
    for (ContextKeyword cKeyword : inputSecureUserProfile.contextKeywords) {
        keywords.add(cKeyword.text);
    }
    List<PartnerBadge> tmpPartnerList = new ArrayList<PartnerBadge>();
    for (PartnerBadge partnerBadge : inputSecureUserProfile.partnerList) {
        tmpPartnerList.add(partnerBadge);
    }
    inputSecureUserProfile.partnerList = inputSecureUserProfile.queryExpansionSourcePartner;
    PartnersFederatedRecommendations pFR = fCore.getPartnersRecommendations(inputSecureUserProfile);
    inputSecureUserProfile.partnerList = tmpPartnerList;

    Directory directory = new RAMDirectory();

    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_48);
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, analyzer);
    IndexWriter writer = null;

    try {
        writer = new IndexWriter(directory, config);
        for (ResultList resultLists : pFR.getResults().values()) {
            for (Result result : resultLists.results) {
                addDoc(writer, result.description);
                addDoc(writer, result.title);
            }
        }

        writer.close();

        IndexReader reader = DirectoryReader.open(directory);
        TermStats[] tStats = null;
        try {
            tStats = HighFreqTerms.getHighFreqTerms(reader, 20, "content", new DocFreqComparator());
        } catch (Exception e) {
            logger.log(Level.SEVERE, "Could not open HighFreqTerms", e);
        } finally {
            reader.close();
        }
        if (tStats != null) {
            for (TermStats termStats : tStats) {
                String utf8String = termStats.termtext.utf8ToString();
                if (utf8String.length() > 4
                        && !checkHighFreqTermsQuery(utf8String.toLowerCase(), keywords)
                        && keywords.add(utf8String.toLowerCase())) {
                    inputSecureUserProfile.contextKeywords.add(new ContextKeyword(utf8String,
                            termStats.docFreq / 100.0, ExpansionType.EXPANSION));
                }
            }
        } else
            logger.log(Level.SEVERE, "TermStats was null!");
    } catch (IOException e) {
        logger.log(Level.SEVERE, "There was and error writing/reading the Index", e);
    }

    logger.log(Level.INFO, "Source Expansion: " + keywords.toString() + " Partners: "
            + inputSecureUserProfile.queryExpansionSourcePartner);
    return inputSecureUserProfile;
}
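
Note that this example closes the IndexWriter and IndexReader only on the success path: if addDoc throws, the writer opened at the top of the try block is never closed. Opening both in try-with-resources blocks (Java 7+) would make the cleanup robust without otherwise changing the logic.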

From source file: eu.eexcess.federatedrecommender.evaluation.schloett.SchloettQueryExtraction.java

License: Open Source License

private static List<Interest> getKeyWordsFromHistoryLinks(
        HashMap<String, LinkedHashMap<String, Object>> hashMap, Object taskId) {

    Directory dir = new RAMDirectory();
    Analyzer analyzer = new StandardAnalyzer();

    IndexWriter writer = null;

    if (hashMap != null)
        for (String keyset : hashMap.keySet()) {
            LinkedHashMap<String, Object> linkedHashMap = hashMap.get(keyset);
            // guard against a missing entry before dereferencing it
            if (linkedHashMap != null && linkedHashMap.get("task_id").equals(taskId)) {
                    Object urlObject = linkedHashMap.get("url");
                    if (urlObject != null)
                        if (!urlObject.toString().contains("http://de.wikipedia.org/wiki")) {

                            URL url = null;
                            IndexReader reader = null;
                            try {
                                reader = DirectoryReader.open(dir);
                            } catch (IOException e4) {
                                // the index may not exist yet on the first pass; searcher stays null
                            }
                            IndexSearcher searcher = null;
                            if (reader != null)
                                searcher = new IndexSearcher(reader);
                            TopDocs docs = null;
                            if (searcher != null) {

                                try {
                                    docs = searcher.search(new TermQuery(new Term("url", urlObject.toString())),
                                            1);
                                } catch (IOException e4) {
                                    e4.printStackTrace();
                                }
                            }
                            // only fetch and index pages that are not already in the index
                            if (docs == null || docs.totalHits == 0) {

                                try {
                                    url = new URL(urlObject.toString());
                                } catch (MalformedURLException e3) {
                                    e3.printStackTrace();
                                }

                                try {
                                    reader.close();
                                } catch (Exception e3) {
                                    // reader may be null on the first pass; nothing to close
                                }
                                InputStream input = null;
                                if (url != null) {
                                    try {

                                        input = url.openStream();
                                    } catch (IOException e2) {
                                        // unreachable URLs are skipped; input stays null
                                    }
                                    if (input != null) {
                                        LinkContentHandler linkHandler = new LinkContentHandler();
                                        BodyContentHandler textHandler = new BodyContentHandler(
                                                10 * 1024 * 1024);
                                        ToHTMLContentHandler toHTMLHandler = new ToHTMLContentHandler();
                                        TeeContentHandler teeHandler = new TeeContentHandler(linkHandler,
                                                textHandler, toHTMLHandler);
                                        Metadata metadata = new Metadata();
                                        ParseContext parseContext = new ParseContext();
                                        HtmlParser parser = new HtmlParser();

                                        try {
                                            parser.parse(input, teeHandler, metadata, parseContext);
                                        } catch (IOException | SAXException | TikaException e1) {
                                            System.out.println(urlObject.toString());
                                            e1.printStackTrace();
                                        }
                                        String string = textHandler.toString();
                                        String docString = " ";

                                        String tagged = tagger.tagString(string.toLowerCase());
                                        Pattern pattern = Pattern.compile("\\s\\w+(_NN|_NNS)");
                                        Matcher matcher = pattern.matcher(tagged);
                                        while (matcher.find()) {
                                            if (!blackList.contains(matcher.group().replaceAll("_NN|_NNS", "")))
                                                docString += matcher.group().replaceAll("_NN|_NNS", " ") + " ";
                                        }

                                        // System.out.println("#######");
                                        // System.out.println(docString);
                                        // for (String string2 :
                                        // docString.split("\\s")) {
                                        // if(string2.length()>1)
                                        // System.out
                                        // .print("\""+string2+"\",");
                                        // }
                                        // System.out.println("#######");
                                        Document doc = new Document();

                                        doc.add(new TextField("content", docString, Store.YES));
                                        doc.add(new StringField("url", urlObject.toString(), Store.YES));

                                        try {
                                            IndexWriterConfig config = new IndexWriterConfig(
                                                    Version.LUCENE_4_10_0, analyzer);
                                            writer = new IndexWriter(dir, config);
                                            writer.addDocument(doc);
                                            writer.close();
                                            input.close();
                                        } catch (IOException e) {
                                            e.printStackTrace();
                                        }

                                    }
                                }
                            }
                        }

                }
        }

    IndexReader reader = null;
    try {
        reader = DirectoryReader.open(dir);
    } catch (Exception e1) {
        System.out.println(e1);
    }
    TermStats[] tStats = null;
    if (reader != null)
        try {
            tStats = HighFreqTerms.getHighFreqTerms(reader, 30, "content", new DocFreqComparator());
        } catch (Exception e) {
            System.out.println(e);
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    List<Interest> keywordList = new ArrayList<Interest>();
    System.out.println("Extraction: ");
    if (tStats != null) {
        for (TermStats termStats : tStats) {
            String utf8String = termStats.termtext.utf8ToString();
            if (!blackList.contains(utf8String.toLowerCase())) {

                System.out.print("\"" + utf8String.toLowerCase() + "\",");
                keywordList.add(new Interest(utf8String.toLowerCase()));
            }

        }
    }
    System.out.println();
    return keywordList;
}

From source file: eu.eexcess.sourceselection.redde.indexer.topterm.IndexHelper.java

License: Apache License

/**
 * Returns the top (to - startFrom + 1) terms, beginning with the term at
 * position startFrom.
 * 
 * @param startFrom
 *            index of the first term
 * @param to
 *            index of the last term
 * @return the extracted term names
 * @throws Exception
 */
protected String[] getTopTerms(int startFrom, int to) throws Exception {
    int numTerms = to - startFrom + 1;
    String[] termNames = null;
    TermStats[] terms = HighFreqTerms.getHighFreqTerms(inIndexReader, to + 1, fieldOfInterest,
            new HighFreqTerms.DocFreqComparator());

    termNames = new String[numTerms];
    int idx = 0;
    // skip the first startFrom entries so the window really begins at startFrom
    for (int i = startFrom; i < terms.length && idx < numTerms; i++) {
        String termDetails = terms[i].toString();
        int startIndex = termDetails.lastIndexOf(termVariableName) + termVariableName.length();
        int endIndex = termDetails.indexOf(" ", startIndex);
        termNames[idx++] = termDetails.substring(startIndex, endIndex);
    }
    return termNames;
}
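
A side note on this example: recovering the term text by parsing TermStats.toString() is brittle, since it depends on the exact formatting of that string. The raw term is available directly as term.termtext.utf8ToString(), which is what the other examples on this page use.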

From source file: luceneindexcreator.LuceneIndexCreator.java

public static void main(String[] args) {
    try {
        // Order by collection-wide term frequency. Long.compare returns 0 for
        // ties, which the Comparator contract requires.
        Comparator<TermStats> comparator = new Comparator<TermStats>() {
            @Override
            public int compare(TermStats t1, TermStats t2) {
                return Long.compare(t1.totalTermFreq, t2.totalTermFreq);
            }
        };

        LuceneIndexCreator lw = new LuceneIndexCreator(INDEX_PATH, JSON_FILE_PATH_WEEKLY);
        lw.createIndex();

        //Check the index has been created successfully
        Directory indexDirectory = FSDirectory.open(new File(INDEX_PATH));
        IndexReader indexReader = DirectoryReader.open(indexDirectory);

        int numDocs = indexReader.numDocs();
        /* Keywords SORTED BY DATE
         *      //generation of Date indexes and the associated json files of keyword freq            
         *      ArrayList<String> indexedDates = new ArrayList<String>();
         *      for ( int i = 0; i < numDocs; i++){
         *          Document document = indexReader.document(i);
         *          //indexRader.toString(i);
         *          String date = document.get("Date");
         *          if (!contains(indexedDates, date)) {
         *              LuceneIndexCreator lwd = new LuceneIndexCreator(PARENT_INDEX_PATH + date, JSON_FILE_PATH_WEEKLY);
         *              lwd.createSubindexDate(date);
         *              indexedDates.add(date);
         *          }
         *          Directory indexDirectoryDate = FSDirectory.open(new File(PARENT_INDEX_PATH + date));
         *          IndexReader indexReaderDate = DirectoryReader.open(indexDirectoryDate);
         *          HighFreqTerms hTerms = new HighFreqTerms();
         *          JSONArray termResultJSONArray = new JSONArray();
         *          TermStats[] hTermResult = hTerms.getHighFreqTerms(indexReaderDate, 50, "content", comparator);
         *          //creating json object
         *          for (int j = 0; j < hTermResult.length; j++) {
         *              JSONObject termResultJSON = new JSONObject();
         *              termResultJSON.put("Term", hTermResult[j].termtext.utf8ToString());
         *              termResultJSON.put("Frequency", hTermResult[j].totalTermFreq);
         *              termResultJSONArray.add(termResultJSON);
         *              //System.out.println("" + hTermResult[i].termtext.utf8ToString() + " " +  hTermResult[i].totalTermFreq);
         *          }
         *          //outputting json
         *          try(FileWriter file = new FileWriter("JSONResults/" + date + ".json")) {
         *              file.write(termResultJSONArray.toJSONString());
         *              System.out.println("Successfully Copied JSON Object to File...");
         *              System.out.println("\nJSON Object: " + termResultJSONArray );
         *
         *          }
         *              //date = date.substring(5, 16).trim();
         *              //System.out.println( "d=" + document.get("content"));
         *              //System.out.println("date: " + date + ".");
         *      }
        */

        // keywords sorted by week
        //generation of Date indexes and the associated json files of keyword freq                      
        ArrayList<String> indexedWeeks = new ArrayList<String>();

        //creating subindexes for each week
        for (int i = 0; i < numDocs; i++) {
            Document document = indexReader.document(i);
            //System.out.println(document.get("Week_number"));
            //System.out.println(document.get("Date"));
            String weekNum = document.get("Week_number");
            if (!contains(indexedWeeks, weekNum)) {
                LuceneIndexCreator lww = new LuceneIndexCreator(PARENT_INDEX_PATH + "week" + weekNum,
                        JSON_FILE_PATH_WEEKLY);
                lww.createSubindexWeek(weekNum);
                indexedWeeks.add(weekNum);
            }
        }
        JSONArray json1 = new JSONArray();
        for (String weekNum : indexedWeeks) {
            Directory indexDirectoryWeek = FSDirectory.open(new File(PARENT_INDEX_PATH + "week" + weekNum));
            IndexReader indexReaderWeek = DirectoryReader.open(indexDirectoryWeek);
            // getHighFreqTerms is static, so no HighFreqTerms instance is needed
            TermStats[] hTermResult = HighFreqTerms.getHighFreqTerms(indexReaderWeek, 100, "content",
                    comparator);

            //creating json object 
            JSONObject json2 = new JSONObject();
            json2.put("Week", weekNum);
            JSONArray json3 = new JSONArray();
            for (int j = 0; j < hTermResult.length; j++) {
                JSONObject json4 = new JSONObject();
                json4.put("Term", hTermResult[j].termtext.utf8ToString());
                json4.put("Frequency", hTermResult[j].totalTermFreq);
                json3.add(json4);
            }
            json2.put("Terms", json3);
            json1.add(json2);
            indexReaderWeek.close(); // release the per-week reader before the next iteration
        }
        //output json
        try (FileWriter file = new FileWriter("JSONResults/allWeeklyTerms.json")) {
            file.write(json1.toJSONString());
            System.out.println("Successfully Copied JSON Object to File...");
            System.out.println("\nJSON Object: " + json1);
        }

        // term frequencies across all docs; getHighFreqTerms is static, so it is called directly
        JSONArray termResultJSONArray = new JSONArray();

        TermStats[] hTermResult = HighFreqTerms.getHighFreqTerms(indexReader, 150, "content", comparator);

        //creating json object
        for (int i = 0; i < hTermResult.length; i++) {
            JSONObject termResultJSON = new JSONObject();
            termResultJSON.put("Term", hTermResult[i].termtext.utf8ToString());
            termResultJSON.put("Frequency", hTermResult[i].totalTermFreq);
            termResultJSONArray.add(termResultJSON);
            //System.out.println("" + hTermResult[i].termtext.utf8ToString() + " " +  hTermResult[i].totalTermFreq);
        }
        //outputting json
        try (FileWriter file = new FileWriter("JSONResults/allTermFreq.json")) {
            file.write(termResultJSONArray.toJSONString());
            System.out.println("Successfully Copied JSON Object to File...");
            System.out.println("\nJSON Object: " + termResultJSONArray);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file: servlets.TermStatsComparator.java

/**
 * Processes requests for both HTTP <code>GET</code> and <code>POST</code>
 * methods.
 *
 * @param request servlet request
 * @param response servlet response
 * @throws ServletException if a servlet-specific error occurs
 * @throws IOException if an I/O error occurs
 */
protected void processRequest(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    response.setContentType("text/html;charset=UTF-8");
    try (PrintWriter out = response.getWriter()) {
        IndexReader reader = retriever.getReader();
        String term = request.getParameter("term");

        if (isNumber(term)) {
            TermStats[] termStats = null;
            try {
                termStats = HighFreqTerms.getHighFreqTerms(reader, Integer.parseInt(term),
                        TrecDocRetriever.FIELD_ANALYZED_CONTENT, new TermStatsComparator());
            } catch (Exception ex) {
                out.println("Error in obtaining term stats");
            }
            if (termStats == null) {
                out.println("Error in obtaining term stats");
                return;
            }

            StringBuilder responseBuff = new StringBuilder("<table><tbody>");
            responseBuff.append("<tr>").append("<th>").append("Term").append("</th>").append("<th>")
                    .append("Doc Freq").append("</th>").append("<th>").append("Coll Freq").append("</th>")
                    .append("</tr>");

            for (TermStats ts : termStats) {
                responseBuff.append("<tr>").append("<td>").append(ts.termtext.utf8ToString()).append("</td>")
                        .append("<td>").append(ts.docFreq).append("</td>").append("<td>")
                        .append(ts.totalTermFreq).append("</td>").append("</tr>");
            }
            responseBuff.append("</tbody></table>");
            out.println(responseBuff.toString());
        } else {
            String analyzedTerm = analyze(term);
            Term t = new Term(TrecDocRetriever.FIELD_ANALYZED_CONTENT, analyzedTerm);
            int docFreq = reader.docFreq(t);
            long collFreq = reader.totalTermFreq(t);
            out.println("Doc freq: " + docFreq + "&nbsp;&nbsp;" + "Coll Freq: " + collFreq);
        }
    }
}
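
In this servlet, a numeric term parameter is treated as a count and answered with the top-N terms from getHighFreqTerms, while any other input is analyzed and looked up as a single term, reporting its document and collection frequencies.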