fr.ericlab.sondy.algo.eventdetection.MABED.java Source code

Introduction

Here is the source code for fr.ericlab.sondy.algo.eventdetection.MABED.java, the SONDY implementation of MABED (mention-anomaly-based event detection).

Source

/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package fr.ericlab.sondy.algo.eventdetection;

import fr.ericlab.sondy.algo.AlgorithmParameter;
import fr.ericlab.sondy.algo.eventdetection.mabed.MABEDTimeInterval;
import fr.ericlab.sondy.algo.eventdetection.mabed.MABEDTopic;
import fr.ericlab.sondy.algo.eventdetection.mabed.MABEDTopicGraph;
import fr.ericlab.sondy.algo.eventdetection.mabed.MABEDTopicList;
import fr.ericlab.sondy.algo.eventdetection.mabed.MABEDWeightedTerm;
import fr.ericlab.sondy.core.DataManipulation;
import fr.ericlab.sondy.core.access.DBAccess;
import fr.ericlab.sondy.core.access.IndexAccess;
import fr.ericlab.sondy.core.access.MentionIndexAccess;
import fr.ericlab.sondy.core.structure.DetectionResult;
import fr.ericlab.sondy.core.structure.TermInfo;
import fr.ericlab.sondy.core.structure.list.TermInfoList;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.logging.Level;
import java.util.logging.Logger;
import javafx.collections.FXCollections;
import javafx.collections.ObservableList;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

////////////////////////////////////////////////////////////////////////////////
//  This file is part of SONDY.                                               //
//                                                                            //
//  SONDY is free software: you can redistribute it and/or modify             //
//  it under the terms of the GNU General Public License as published by      //
//  the Free Software Foundation, either version 3 of the License, or         //
//  (at your option) any later version.                                       //
//                                                                            //
//  SONDY is distributed in the hope that it will be useful,                  //
//  but WITHOUT ANY WARRANTY; without even the implied warranty of            //
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the             //
//  GNU General Public License for more details.                              //
//                                                                            //
//  You should have received a copy of the GNU General Public License         //
//  along with SONDY.  If not, see <http://www.gnu.org/licenses/>.            //
////////////////////////////////////////////////////////////////////////////////

/**
 *   @author Adrien GUILLE, Laboratoire ERIC, Université Lumière Lyon 2
 */
final public class MABED extends EventDetectionAlgorithm {

    double minTermSupport = 0.0;
    double maxTermSupport = 1.0;
    int nbEvents = 10;
    int nbRelatedTerms = 10;

    int[] distribution;
    DataManipulation dataManipulation;

    // algorithm constants
    double maximumScore;
    static int _SMOOTH_ = 6; // window size used when smoothing mention-frequency series
    static Double _SIGMA_ = 0.7; // similarity threshold passed to the topic graph (topic merging)
    static Double _THETA_ = 0.7; // minimum correlation for a term to count as related
    static int _MIN_RELATED_WORDS_ = 2; // minimum number of related terms to keep a topic
    public String info;

    // results
    public MABEDTopicList topics;
    public MABEDTopicGraph topicGraph;

    IndexAccess indexAccess;
    DBAccess dbAccess;

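    /**
     * Runs the MABED pipeline: parses the user-supplied parameters
     * (minTermSupport at index 0, maxTermSupport at index 1, events at
     * index 4; the theta and sigma parameters declared in the constructor
     * are not read here, so the hard-coded _THETA_ and _SIGMA_ defaults
     * apply), detects candidate topics, refines them with related terms,
     * merges redundant topics and formats the result list.
     */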
    @Override
    public ObservableList<DetectionResult> apply() {
        if (parameters.get(0).getValue() != null && !parameters.get(0).getValue().equals("")) {
            minTermSupport = Double.parseDouble(parameters.get(0).getValue());
        }
        if (parameters.get(1).getValue() != null && !parameters.get(1).getValue().equals("")) {
            maxTermSupport = Double.parseDouble(parameters.get(1).getValue());
        }
        if (parameters.get(4).getValue() != null && !parameters.get(4).getValue().equals("")) {
            nbEvents = Integer.parseInt(parameters.get(4).getValue());
        }
        long startNanoTime = System.nanoTime();
        dataManipulation = new DataManipulation();
        distribution = dataManipulation.getDistribution(appVariables);
        indexAccess = new IndexAccess(appVariables);

        // Get basic topics
        MABEDTopicList basicTopics = getSimpleTopics((int) (minTermSupport * appVariables.nbMessages),
                (int) (maxTermSupport * appVariables.nbMessages));
        basicTopics.sort();

        // Get final topics
        int nbFinalTopics = 0;
        int i = 0;
        if (basicTopics.size() > 0) {
            topicGraph = new MABEDTopicGraph(basicTopics.get(0).score, _SIGMA_);
            dbAccess = new DBAccess();
            dbAccess.initialize(appVariables, false);
            while (nbFinalTopics < nbEvents && i < basicTopics.size()) {
                MABEDTopic topic = getRefinedTopic(basicTopics.get(i), nbRelatedTerms);
                if (topic.relatedTerms.size() >= _MIN_RELATED_WORDS_) {
                    nbFinalTopics += topicGraph.addTopic(topic);
                }
                i++;
            }
            mergeRedundantTopics(topicGraph);
            topics = topicGraph.toTopicList();
            results = FXCollections.observableArrayList();
            float intervalDuration = ((float) appVariables.getCurrentDatasetInterval()) / 60;
            for (MABEDTopic t : topics.list) {
                float startDay = (t.I.timeSliceA * intervalDuration) / 24;
                float endDay = (t.I.timeSliceB * intervalDuration) / 24;
                results.add(new DetectionResult(t.mainTerm + " " + t.relatedTermAsList(),
                        formatter.format(startDay) + ";" + formatter.format(endDay)));
            }
            indexAccess.close();
        }
        long endNanoTime = System.nanoTime();
        long elapsedNanoTime = endNanoTime - startNanoTime;
        double elapsedSecondTime = (double) elapsedNanoTime / (double) 1000000000;
        appVariables.addLogEntry("[event detection] computed MABED, minTermSupport=" + minTermSupport
                + ", maxTermSupport=" + maxTermSupport + ". " + results.size() + " results in "
                + formatter.format(elapsedSecondTime) + "s");
        return results;
    }

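    /**
     * Expected mention frequency of a term in a time slice, assuming its
     * tmf total mentions are spread proportionally to the overall
     * distribution of messages across time slices.
     */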
    float expectation(int timeSlice, float tmf) {
        return distribution[timeSlice] * (tmf / (float) appVariables.nbMessages);
    }

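    /**
     * Anomaly of an observed mention frequency (beta) with respect to its
     * expectation: positive when the term is mentioned more than expected.
     */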
    float anomaly(float expectation, float beta) {
        return beta - expectation;
    }

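    /**
     * Correlation between two frequency series over the time slices [a, b],
     * computed from first-order differences (after Erdem et al.) and
     * rescaled from [-1, 1] to [0, 1].
     */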
    float getErdemCoefficient(float[] ref, float[] comp, int a, int b) {
        float scores1[] = new float[b - a + 1], scores2[] = new float[b - a + 1];
        for (int i = a; i <= b; i++) {
            scores1[i - a] = ref[i];
            scores2[i - a] = comp[i];
        }
        float result = 0;
        float A12 = 0, A1 = 0, A2 = 0;
        for (int i = 1; i < scores1.length; i++) {
            A12 += (scores1[i] - scores1[i - 1]) * (scores2[i] - scores2[i - 1]);
            A1 += (scores1[i] - scores1[i - 1]) * (scores1[i] - scores1[i - 1]);
            A2 += (scores2[i] - scores2[i - 1]) * (scores2[i] - scores2[i - 1]);
        }
        A1 = (float) Math.sqrt(A1 / (scores1.length - 1));
        A2 = (float) Math.sqrt(A2 / (scores1.length - 1));
        result = A12 / ((scores1.length - 1) * A1 * A2);
        return (result + 1) / 2;
    }

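    /**
     * Refines a candidate topic: the messages containing the main term
     * during the topic's interval are indexed in a temporary in-memory
     * Lucene index, the nbrelatedTerms most frequent co-occurring terms are
     * extracted, and only those whose frequency series correlates with the
     * main term's series (Erdem coefficient >= _THETA_) are kept as related
     * terms.
     */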
    MABEDTopic getRefinedTopic(MABEDTopic simpleTopic, int nbrelatedTerms) {
        MABEDTopic refinedTopic = new MABEDTopic();
        String[] frequentTerms = new String[nbrelatedTerms];
        try {
            StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
            RAMDirectory temporaryIndex = new RAMDirectory();
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
            IndexWriter temporaryWriter = new IndexWriter(temporaryIndex, config);
            Document doc = new Document();
            doc.add(new Field("content",
                    dbAccess.getMessagesAsString(appVariables, simpleTopic.mainTerm, simpleTopic.I.timeSliceA,
                            simpleTopic.I.timeSliceB),
                    Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
            temporaryWriter.addDocument(doc);
            temporaryWriter.commit();
            IndexReader temporaryReader = IndexReader.open(temporaryWriter, true);
            TermEnum allTerms = temporaryReader.terms();
            int minFreq = 0;
            TermInfoList termList = new TermInfoList();
            while (allTerms.next()) {
                String term = allTerms.term().text();
                if (!term.equals(simpleTopic.mainTerm) && term.length() > 1 && !appVariables.isStopWord(term)) {
                    int cf = IndexAccess.getTermOccurenceCount(temporaryReader, term);
                    if (cf > minFreq) {
                        termList.addTermInfo(new TermInfo(term, cf));
                        termList.sortList();
                        if (termList.size() > nbrelatedTerms) {
                            termList.removeLast();
                        }
                        minFreq = termList.get(termList.size() - 1).occurence;
                    }
                }
            }
            for (int i = 0; i < termList.size() && i < nbrelatedTerms; i++) {
                frequentTerms[i] = termList.get(i).text;
            }
            temporaryWriter.close();
            temporaryReader.close();
            temporaryIndex.close();

            float ref[] = indexAccess.getTermFrequency(appVariables, simpleTopic.mainTerm);
            float comp[];
            refinedTopic = new MABEDTopic(simpleTopic.mainTerm, simpleTopic.I, simpleTopic.score,
                    simpleTopic.anomaly);
            for (int j = 0; j < nbrelatedTerms && frequentTerms[j] != null; j++) {
                comp = indexAccess.getTermFrequency(appVariables, frequentTerms[j]);
                double w = getErdemCoefficient(ref, comp, simpleTopic.I.timeSliceA, simpleTopic.I.timeSliceB);
                if (w >= _THETA_) {
                    refinedTopic.relatedTerms.add(new MABEDWeightedTerm(frequentTerms[j], w));
                }
            }
        } catch (IOException ex) {
            Logger.getLogger(MABED.class.getName()).log(Level.SEVERE, null, ex);
        }
        return refinedTopic;
    }

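    /**
     * Builds the list of candidate topics. For every indexed term that is
     * neither too rare nor too frequent (global frequency strictly between
     * minTermOccur and maxTermOccur), the sequence of mention anomalies is
     * computed, then scanned for the contiguous interval with the maximum
     * cumulative anomaly; that interval and its cumulative anomaly become
     * the candidate topic's time interval and score.
     */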
    MABEDTopicList getSimpleTopics(int minTermOccur, int maxTermOccur) {
        MABEDTopicList simpleTopics = new MABEDTopicList();
        int m = appVariables.messageSet.nbTimeSlice;
        try {
            IndexReader r = indexAccess.reader;
            TermEnum allTerms = r.terms();
            MentionIndexAccess mentionIndexAccess = new MentionIndexAccess(appVariables);
            while (allTerms.next()) {
                String term = allTerms.term().text();
                if (term.length() > 2 && !appVariables.isStopWord(term)) {
                    TermDocs termDocs = r.termDocs(allTerms.term());
                    float alpha[], beta[];
                    alpha = indexAccess.getTermFrequency(appVariables, termDocs);
                    beta = mentionIndexAccess.getTermFrequency(appVariables, term);
                    float tmf = beta[m]; // total mention frequency of the term (last array slot)
                    float tgf = alpha[m]; // total global frequency of the term
                    if (tgf > minTermOccur && tgf < maxTermOccur) {
                        float expectation;
                        if (_SMOOTH_ > 0) {
                            beta = dataManipulation.getSmoothedTermFrequency(beta, _SMOOTH_);
                        }
                        float scoreSequence[] = new float[m];
                        for (int i = 0; i < m; i++) {
                            expectation = expectation(i, tmf);
                            scoreSequence[i] = anomaly(expectation, beta[i]);
                        }
                        // Maximal-anomaly interval search: I holds candidate
                        // intervals, L and R the cumulative anomaly score to the
                        // left of and up to each interval, used to decide merges.
                        LinkedList<MABEDTimeInterval> I = new LinkedList<>();
                        LinkedList<Float> L = new LinkedList<>();
                        LinkedList<Float> R = new LinkedList<>();
                        ArrayList<Float> anomaly = new ArrayList<>();
                        for (int i = 0; i < m; i++) {
                            anomaly.add(scoreSequence[i] > 0.0 ? scoreSequence[i] : 0);
                            if (scoreSequence[i] > 0) {
                                int k = I.size();
                                float Lk = 0, Rk = DataManipulation.sum(scoreSequence, 0, i);
                                if (i > 0) {
                                    Lk = DataManipulation.sum(scoreSequence, 0, i - 1);
                                }
                                int j = 0;
                                boolean foundJ = false;
                                for (int l = k - 1; l >= 0 && !foundJ; l--) {
                                    if (L.get(l) < Lk) {
                                        foundJ = true;
                                        j = l;
                                    }
                                }
                                if (foundJ && R.get(j) < Rk) {
                                    MABEDTimeInterval Ik = new MABEDTimeInterval(I.get(j).timeSliceA, i);
                                    for (int p = j; p < k; p++) {
                                        I.removeLast();
                                        L.removeLast();
                                        R.removeLast();
                                    }
                                    k = j;
                                    I.add(Ik);
                                    L.add(DataManipulation.sum(scoreSequence, 0, Ik.timeSliceA - 1));
                                    R.add(DataManipulation.sum(scoreSequence, 0, Ik.timeSliceB));
                                } else {
                                    I.add(new MABEDTimeInterval(i, i));
                                    L.add(Lk);
                                    R.add(Rk);
                                }
                            }
                        }
                        if (I.size() > 0) {
                            // Keep the interval with the highest cumulative anomaly;
                            // that cumulative anomaly becomes the topic's score.
                            MABEDTimeInterval maxI = I.get(0);
                            for (MABEDTimeInterval Ii : I) {
                                if (DataManipulation.sum(scoreSequence, Ii.timeSliceA,
                                        Ii.timeSliceB) > DataManipulation.sum(scoreSequence, maxI.timeSliceA,
                                                maxI.timeSliceB)) {
                                    maxI.timeSliceA = Ii.timeSliceA;
                                    maxI.timeSliceB = Ii.timeSliceB;
                                }
                            }
                            double score = DataManipulation.sum(scoreSequence, maxI.timeSliceA,
                                    maxI.timeSliceB);
                            simpleTopics.add(new MABEDTopic(term, maxI, score, anomaly));
                        }
                    }
                }
            }
            mentionIndexAccess.close();
        } catch (IOException ex) {
            Logger.getLogger(MABED.class.getName()).log(Level.SEVERE, null, ex);
        }
        simpleTopics.sort();
        return simpleTopics;
    }

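    /**
     * Merges duplicate topics by identifying connected components in the
     * topic graph, whose edges depend on the _SIGMA_ similarity threshold.
     */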
    void mergeRedundantTopics(MABEDTopicGraph topicGraph) {
        topicGraph.identifyConnectedComponents();
    }

    @Override
    public String getName() {
        return "MABED";
    }

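    /**
     * Declares the five user-settable parameters: minTermSupport,
     * maxTermSupport, theta, sigma and events.
     */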
    public MABED() {
        super();
        parameters = FXCollections.observableArrayList(new AlgorithmParameter("minTermSupport", ""),
                new AlgorithmParameter("maxTermSupport", ""), new AlgorithmParameter("theta", ""),
                new AlgorithmParameter("sigma", ""), new AlgorithmParameter("events", ""));
        algoDescription = "Mention-anomaly-based event detection";
    }

    @Override
    public String getReference() {
        return "<li><b>MABED:</b> Guille A. and Favre C. Mention-anomaly-based Event Detection in Twitter, <i>In Proceedings of the 2014 IEEE/ACM International Conference on Advances in Social Network Analysis and Mining</i>, 2014</li>";
    }
}