gnusmail.filters.WordFrequency.java Source code

Java tutorial

Introduction

Here is the source code for gnusmail.filters.WordFrequency.java

Source

/*
 * Copyright 2011 Universidad de Málaga.  All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 3 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 3 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 3 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Universidad de Málaga, 29071 Malaga, Spain or visit
 * www.uma.es if you need additional information or have any questions.
 * 
 */
package gnusmail.filters;

import gnusmail.Languages.Language;
import gnusmail.core.WordsStore;
import gnusmail.datasource.Document;
import gnusmail.datasource.mailconnection.MailMessage;
import gnusmail.languagefeatures.DocumentTokenizer;
import gnusmail.languagefeatures.LanguageDetection;
import gnusmail.languagefeatures.Token;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.mail.MessagingException;

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;

/**
 * Filter that exposes one nominal {@code "True"}/{@code "False"} attribute
 * per frequent word reported by the underlying {@link WordsStore}, and that
 * marks, for each document, whether the stemmed form of that word occurs in
 * the document.
 *
 * <p>Typical use, as far as this class shows: call
 * {@link #updateAttValues(Document)} for every training document, then
 * {@link #getAttributes()} to build the attribute list, then
 * {@link #updateInstance(Instance, Document)} for each instance to fill.
 *
 * @author jmcarmona
 */
public class WordFrequency extends Filter {

    private static final Logger LOGGER = Logger.getLogger(WordFrequency.class.getName());

    /** Stemmed forms shorter than this are ignored when filling an instance. */
    private static final int MIN_STEM_LENGTH = 3;

    /** Number of documents seen so far for each label. */
    private TreeMap<String, Integer> labelMap;
    /** Accumulates the tokens of every document, grouped by label. */
    private WordsStore ws;
    // NOTE(review): not used anywhere in this class; kept because it is
    // package-visible and may be referenced from elsewhere.
    static List<String> wordsToAnalyze;
    /** Attributes produced by the most recent call to {@link #getAttributes()}. */
    List<Attribute> attList;
    /** Word to attribute lookup, rebuilt by {@link #getAttributes()}. */
    private Map<String, Attribute> attrMap;

    /** Creates a filter with no documents registered and no attributes built. */
    public WordFrequency() {
        labelMap = new TreeMap<String, Integer>();
        ws = new WordsStore();
        attList = new ArrayList<Attribute>();
    }

    /**
     * @return the fixed name of this filter, {@code "WordFrequency"}
     */
    @Override
    public String getName() {
        return "WordFrequency";
    }

    /**
     * Builds one nominal attribute (values {@code "True"} and {@code "False"})
     * for every word returned by {@code WordsStore.getFrequentWords()}.
     *
     * <p>Before asking for the frequent words, the per-label statistics of
     * the term frequency manager are refreshed with the document counts
     * gathered by {@link #updateAttValues(Document)}.
     *
     * <p>The attribute list is rebuilt from scratch on every call, so calling
     * this method more than once does not accumulate duplicate attributes.
     *
     * @return the list of attributes, one per frequent word; the same list
     *         object is returned on every call
     */
    @Override
    public List<Attribute> getAttributes() {
        attrMap = new TreeMap<String, Attribute>();
        // Reset in place: previously the list was only ever appended to, so a
        // second call duplicated every attribute.
        attList.clear();

        // This probably belongs in the word-frequency manager itself
        // (translated from the original, unfinished, comment).
        for (Map.Entry<String, Integer> entry : labelMap.entrySet()) {
            String folder = entry.getKey();
            ws.getTermFrequencyManager().updateWordCountPorFolder(folder);
            ws.getTermFrequencyManager().setNumberOfDocumentsByFolder(folder, entry.getValue());
        }

        for (String word : ws.getFrequentWords()) {
            FastVector values = new FastVector();
            values.addElement("True");
            values.addElement("False");
            Attribute att = new Attribute(word, values);
            attList.add(att);
            attrMap.put(word, att);
        }
        return attList;
    }

    /**
     * Registers a document: its tokens are added to the word store under the
     * document's label, and the document count for that label is incremented.
     *
     * @param doc the document to register
     */
    @Override
    public void updateAttValues(Document doc) {
        String label = doc.getLabel();
        List<Token> tokens = tokenizeDocument(doc);
        ws.addTokenizedString(tokens, label);

        // Explicit first-occurrence check instead of catching the
        // NullPointerException raised by unboxing a missing count.
        Integer seen = labelMap.get(label);
        labelMap.put(label, seen == null ? 1 : seen + 1);
    }

    /**
     * Sets every attribute built by {@link #getAttributes()} on the given
     * instance to {@code "True"} if the document contains a token whose
     * stemmed form equals the attribute name, and to {@code "False"}
     * otherwise. Stemmed forms shorter than three characters are ignored.
     *
     * @param inst the instance whose attribute values are set
     * @param doc  the document the instance represents
     */
    @Override
    public void updateInstance(Instance inst, Document doc) {
        Set<String> stemsInDocument = new HashSet<String>();
        for (Token token : tokenizeDocument(doc)) {
            String stemmedForm = token.getStemmedForm();
            if (stemmedForm.length() >= MIN_STEM_LENGTH) {
                stemsInDocument.add(stemmedForm);
            }
        }

        // TODO: this is slow, improve it. The original author suggested
        // adding a getWords method to Document.
        for (Attribute att : attList) {
            inst.setValue(att, stemsInDocument.contains(att.name()) ? "True" : "False");
        }
    }

    /**
     * Tokenizes the text of a document, followed by its subject when the
     * document is a {@link MailMessage}.
     *
     * <p>Missing (null) text or subject is treated as empty instead of being
     * turned into the literal string {@code "null"}. Text and subject are
     * separated by a newline so that the last word of one does not fuse with
     * the first word of the other. If reading the message fails, the failure
     * is logged and whatever was read before the failure is tokenized.
     *
     * @param doc the document to tokenize
     * @return the tokens produced by {@link DocumentTokenizer} for the
     *         language detected on the combined text
     */
    private static List<Token> tokenizeDocument(Document doc) {
        String text = "";
        String subject = "";
        try {
            // Read the text first so it survives a failure while reading the subject.
            String rawText = doc.getText();
            if (rawText != null) {
                text = rawText;
            }
            if (doc instanceof MailMessage) {
                String rawSubject = ((MailMessage) doc).getMessage().getSubject();
                if (rawSubject != null) {
                    subject = rawSubject;
                }
            }
        } catch (MessagingException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }

        String body;
        if (text.length() > 0 && subject.length() > 0) {
            body = text + "\n" + subject;
        } else {
            body = text + subject;
        }

        Language lang = new LanguageDetection().detectLanguage(body);
        DocumentTokenizer tokenizer = new DocumentTokenizer(body, lang);
        return tokenizer.tokenize();
    }
}