com.mmiagency.knime.nodes.keyworddensity.util.KeywordDensityHelper.java Source code

Introduction

Here is the source code for com.mmiagency.knime.nodes.keyworddensity.util.KeywordDensityHelper.java
Source

/*
 * ------------------------------------------------------------------------
 * Copyright by MMI Agency, Houston, Texas, USA
 * Website: http://www.mmiagency.com; Contact: 713-929-6900
 *
 * The MMI KNIME Node is Copyright (C) 2015, MMI Agency The KNIME Nodes 
 * are free software: you can redistribute it and/or modify it under the 
 * terms of the GNU General Public License as published by the Free 
 * Software Foundation, either version 3 of the License, or (at your 
 * option) any later version. 
 * 
 * The KNIME Nodes are distributed in the hope that it will be useful, 
 * but WITHOUT ANY WARRANTY; without even the implied warranty of 
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 
 * General Public License for more details. You should have received a 
 * copy of the GNU General Public License along with the KNIME Nodes. If 
 * not, see <http://www.gnu.org/licenses/>.
 * ------------------------------------------------------------------------
 */
package com.mmiagency.knime.nodes.keyworddensity.util;

import java.io.IOException;
import java.io.StringWriter;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.jsoup.Connection;
import org.jsoup.Jsoup;

/**
 * Computes keyword density for a single HTML page.
 * <p>
 * Jsoup is used to obtain and parse the HTML (either from supplied content or
 * by fetching the URL). Lucene's {@link StandardAnalyzer} tokenizes the text
 * and a term vector supplies the frequency of every remaining term. The
 * frequency of each term divided by the sum of all counted frequencies is
 * the term's density.
 * <p>
 * Typical use: construct, call {@link #execute()}, then read the results
 * through {@link #iterator()}. Instances keep mutable state and nothing in
 * this class synchronizes access to it.
 *
 * @author ed
 *
 */
public class KeywordDensityHelper {

    /** Name of the single Lucene field that holds the page text. */
    private static final String CONTENT_FIELD = "content";

    private final String m_url;
    private final String m_content;
    private final boolean m_includeMetaKeywords;
    private final boolean m_includeMetaDescription;
    private final boolean m_includePageTitle;

    /** Lower-cased terms that must not be counted. */
    private final Set<String> m_excludeSet = new HashSet<String>();

    /** Sum of the frequencies of all counted (non-excluded) terms. */
    private int m_total = 0;

    /** Term to frequency; ordered by descending frequency once {@link #execute()} has run. */
    private Map<String, Integer> m_keywordMap = new HashMap<String, Integer>();

    /**
     * Creates a helper for one page.
     *
     * @param url                    address of the page; fetched only when {@code content} is
     *                               null or blank, and reported on every result row
     * @param content                HTML to analyze; when null or blank the page is fetched
     *                               from {@code url} instead
     * @param exclude                terms to ignore, separated by commas and/or whitespace;
     *                               may be null
     * @param includeMetaKeywords    whether the {@code meta[name=keywords]} content is counted
     * @param includeMetaDescription whether the {@code meta[name=description]} content is counted
     * @param includePageTitle       whether the {@code title} text is counted
     */
    public KeywordDensityHelper(final String url, final String content, final String exclude,
            final boolean includeMetaKeywords, final boolean includeMetaDescription,
            final boolean includePageTitle) {
        m_url = url;
        m_content = content;

        if (exclude != null) {
            // String is immutable, so a bare exclude.replace(...) has no effect; split on
            // commas and whitespace directly so "a,b" and "a, b" both yield two terms.
            for (String token : exclude.split("[,\\s]+")) {
                // Locale.ROOT keeps the lower-casing independent of the default locale.
                String normalized = token.trim().toLowerCase(Locale.ROOT);
                if (!normalized.isEmpty()) {
                    m_excludeSet.add(normalized);
                }
            }
        }

        m_includeMetaKeywords = includeMetaKeywords;
        m_includeMetaDescription = includeMetaDescription;
        m_includePageTitle = includePageTitle;
    }

    /**
     * Loads the page, counts term frequencies and orders the result by
     * descending frequency. Results of any earlier call are discarded first,
     * so the method can be invoked repeatedly.
     *
     * @throws IOException if the page cannot be fetched or the in-memory index fails
     */
    public void execute() throws IOException {
        // Start from a clean slate so repeated calls do not accumulate totals.
        m_keywordMap = new HashMap<String, Integer>();
        m_total = 0;

        org.jsoup.nodes.Document jdoc = loadDocument();
        countTerms(extractText(jdoc));

        // sort map by value
        sortMap();
    }

    /**
     * Returns the parsed page, from the supplied content when present and
     * otherwise by fetching the URL.
     */
    private org.jsoup.nodes.Document loadDocument() throws IOException {
        if (m_content != null && !m_content.trim().isEmpty()) {
            return Jsoup.parse(m_content);
        }

        Connection conn = Jsoup.connect(m_url);

        // NOTE(review): certificate validation is switched off here, which accepts any
        // TLS certificate - confirm this is still wanted.
        conn.validateTLSCertificates(false);
        conn.followRedirects(true);
        conn.userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:40.0) Gecko/20100101 Firefox/40.0");
        conn.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        conn.header("Accept-Language", "en-US,en;q=0.5");
        conn.header("Accept-Encoding", "gzip, deflate");

        // get() executes the request and parses the response; a preceding execute()
        // would download the page a second time.
        return conn.get();
    }

    /** Concatenates the selected meta/title text and the body text, separated by spaces. */
    private String extractText(final org.jsoup.nodes.Document jdoc) {
        StringBuilder text = new StringBuilder();

        if (m_includeMetaKeywords) {
            text.append(jdoc.select("meta[name=keywords]").attr("content")).append(' ');
        }
        if (m_includeMetaDescription) {
            text.append(jdoc.select("meta[name=description]").attr("content")).append(' ');
        }
        if (m_includePageTitle) {
            text.append(jdoc.select("title").text()).append(' ');
        }
        text.append(jdoc.select("body").text());

        return text.toString();
    }

    /**
     * Indexes the text in a throw-away in-memory Lucene index and records the
     * frequency of every term that is not excluded.
     */
    private void countTerms(final String text) throws IOException {
        Directory directory = new RAMDirectory();
        try {
            StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
            // NOTE(review): MaxFieldLength.LIMITED caps the number of terms indexed per
            // field, so very long pages may be counted only partially - confirm intended.
            IndexWriter indexWriter = new IndexWriter(directory, analyzer, MaxFieldLength.LIMITED);
            try {
                Document doc = new Document();
                doc.add(new Field(CONTENT_FIELD, text, Field.Store.YES, Field.Index.ANALYZED,
                        TermVector.WITH_POSITIONS_OFFSETS));
                indexWriter.addDocument(doc);
                indexWriter.commit();
            } finally {
                indexWriter.close();
            }

            IndexReader indexReader = IndexReader.open(directory, true);
            try {
                for (int i = 0; i < indexReader.maxDoc(); i++) {
                    TermFreqVector termFreqVector = indexReader.getTermFreqVector(i, CONTENT_FIELD);
                    if (termFreqVector == null) {
                        // No term vector, e.g. the text was empty or held only stop words.
                        continue;
                    }

                    String[] terms = termFreqVector.getTerms();
                    int[] freqs = termFreqVector.getTermFrequencies();

                    for (int n = 0; n < terms.length; n++) {
                        if (!m_excludeSet.contains(terms[n])) {
                            add(terms[n], freqs[n]);
                        }
                    }
                }
            } finally {
                indexReader.close();
            }
        } finally {
            directory.close();
        }
    }

    /** Adds {@code frequency} occurrences of {@code keyword} to the map and to the running total. */
    private void add(final String keyword, final int frequency) {
        Integer previous = m_keywordMap.get(keyword);
        // Accumulate rather than overwrite so the map always agrees with m_total.
        m_keywordMap.put(keyword, previous == null ? frequency : previous + frequency);
        m_total += frequency;
    }

    /**
     * Returns an iterator over one row per counted keyword, in the current
     * order of the keyword map (descending frequency after {@link #execute()}).
     *
     * @return iterator producing a {@code KeywordDensityRowEntry} per keyword
     */
    public Iterator<KeywordDensityRowEntry> iterator() {
        return new KeywordDensityIterator(m_url, m_keywordMap, m_total);
    }

    /**
     * Re-orders the keyword map by descending frequency; ties are ordered
     * alphabetically so the output is deterministic.
     */
    private void sortMap() {
        List<Map.Entry<String, Integer>> entries =
                new ArrayList<Map.Entry<String, Integer>>(m_keywordMap.entrySet());
        Collections.sort(entries, new FrequencyComparator());

        // A LinkedHashMap preserves the sorted order while keeping normal key lookup,
        // unlike a TreeMap driven by a value comparator that can never return 0.
        Map<String, Integer> sortedMap = new LinkedHashMap<String, Integer>(entries.size() * 2);
        for (Map.Entry<String, Integer> entry : entries) {
            sortedMap.put(entry.getKey(), entry.getValue());
        }
        m_keywordMap = sortedMap;
    }

    /** Orders entries by descending frequency, then by ascending keyword. */
    private static class FrequencyComparator implements Comparator<Map.Entry<String, Integer>> {

        @Override
        public int compare(final Map.Entry<String, Integer> a, final Map.Entry<String, Integer> b) {
            int byFrequency = b.getValue().compareTo(a.getValue());
            if (byFrequency != 0) {
                return byFrequency;
            }
            return a.getKey().compareTo(b.getKey());
        }
    }

    /**
     * Turns keyword/frequency entries into result rows, computing each
     * keyword's share of the total on the fly.
     */
    private static class KeywordDensityIterator implements Iterator<KeywordDensityRowEntry> {

        private final String url;
        private final BigDecimal total;
        private final Iterator<Map.Entry<String, Integer>> iterator;

        /**
         * @param url   page address copied into every row
         * @param map   keyword to frequency, iterated in the map's own order
         * @param total sum of all frequencies, used as the divisor
         */
        public KeywordDensityIterator(String url, Map<String, Integer> map, int total) {
            this.url = url;
            this.total = new BigDecimal(total);
            this.iterator = map.entrySet().iterator();
        }

        @Override
        public boolean hasNext() {
            return iterator.hasNext();
        }

        /**
         * Builds the row for the next keyword: url, keyword, frequency / total
         * (rounded half-up to 8 decimal places) and the raw frequency.
         */
        @Override
        public KeywordDensityRowEntry next() {
            Map.Entry<String, Integer> mapEntry = iterator.next();
            Integer frequency = mapEntry.getValue();
            double density = new BigDecimal(frequency.intValue())
                    .divide(total, 8, RoundingMode.HALF_UP).doubleValue();
            return new KeywordDensityRowEntry(url, mapEntry.getKey(), density, frequency);
        }

        /** Removes the last returned keyword from the backing map; the total is not adjusted. */
        @Override
        public void remove() {
            iterator.remove();
        }
    }
}