expansionBlocks.ProcessCommunities.java Source code

Java tutorial

Introduction

Here is the source code for expansionBlocks.ProcessCommunities.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package expansionBlocks;

import DEFS.Definitions;
import Parameters.Parameters;
import Strings.Tokenize.Tokenizer;
import Structures.MapUtils;
import Structures.Pair;
import com.ibm.icu.text.Normalizer;
import configurations.Configuration;
import edu.upc.dama.dex.core.Graph;
import edu.upc.dama.utils.objects.UtilsMap;
import static expansionBlocks.ProcessCommunities.selectBestCommunities;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import model.Article;
import model.Category;
import model.Entity;
import model.Query;
import org.apache.commons.collections.CollectionUtils;
import scaleCommunities.AbstractCommunityScalator;
import utils.StringUtilsQueryExpansion;

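/**
 * Post-processes the communities attached to a {@link Query}: each community
 * is rescaled, similar communities are fused, the best-matching ones are
 * selected and aggregated into a single weighted community, and a
 * category-based filter proposes entities to remove from it.
 *
 * @author joan
 */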
public class ProcessCommunities {

    /**
     * Runs the community pipeline for one query: scale every community, fuse
     * the similar ones, keep the best-scoring ones, aggregate them and compute
     * a category-filtered variant of the aggregate.
     * <p>
     * Both resulting communities are stored on the query
     * ({@code setCommunity} / {@code setCommunityAfterRemoval}).
     *
     * @param configuration supplies the scalator, the fusion threshold, the
     *                      language and the debug flag
     * @param query         query whose communities are processed
     * @return pair of (aggregated community, aggregated community after the
     *         category-based removal)
     * @throws Exception propagated from the scaling and selection steps
     */
    public static Pair<Map<Entity, Double>, Map<Entity, Double>> execute(Configuration configuration, Query query)
            throws Exception {
        Map<Set<Long>, Map<Entity, Double>> communitiesByPath = query.getCommunities();
        HashSet<Map<Entity, Double>> initialCommunities = new HashSet<>(communitiesByPath.values());

        // Step 1: rescale each community with the configured scalator.
        AbstractCommunityScalator scalator = configuration.getAbstractCommunityScalator();
        Set<Map<Entity, Double>> scaledCommunities = new HashSet<>();
        for (Map<Entity, Double> original : initialCommunities) {
            scaledCommunities.add(scalator.scaledEmphasisArticlesInCommunity(configuration, query, original));
        }

        // Step 2: fuse communities that are similar enough.
        Double threshold = configuration.getFusionThreshold();
        Set<Map<Entity, Double>> fusedCommunities = getCommunitiesFromCommunitiesBasedOnSimilarity(
                scaledCommunities, threshold);
        if (configuration.DEBUG_INFO) {
            println("Fusion communities based on similarity communities: ");
            for (Map<Entity, Double> fused : fusedCommunities) {
                println(fused);
            }
        }
        println(initialCommunities.size() + " communities have been fusioned into " + fusedCommunities.size());

        // Step 3: keep only the communities that best match the query tokens.
        println("[[WARNING]] - Select best community algorithm seems to differ from select best path. You may want to double ckeck it.");
        Set<Map<Entity, Double>> bestCommunities = selectBestCommunities(configuration, fusedCommunities,
                query.getTokenNames());
        if (configuration.DEBUG_INFO) {
            println("Selected best communities: ");
            for (Map<Entity, Double> selected : bestCommunities) {
                println(StringUtilsQueryExpansion.MapDoubleValueToString(selected));
            }
        }

        // Step 4: merge the selected communities into a single weighted one.
        Map<Entity, Double> aggregated = agregateCommunities(bestCommunities);
        if (configuration.DEBUG_INFO) {
            println("Agragated community(size: " + aggregated.size() + "): ");
            println(StringUtilsQueryExpansion.MapDoubleValueToString(aggregated));
        }

        // Step 5: build the category-filtered variant of the aggregate.
        Set<Entity> entitiesToRemove = new HashSet<>();
        entitiesToRemove.addAll(removableAccordingToCategories(aggregated));

        Map<Entity, Double> filtered = new HashMap<>(aggregated);
        filtered.keySet().removeAll(entitiesToRemove);

        println("Based on category analisy I would suggest to remove: " + entitiesToRemove);
        println("New Community  in case of category based filtering"
                + StringUtilsQueryExpansion.MapDoubleValueToString(filtered));

        query.setCommunityAfterRemoval(filtered);
        query.setCommunity(aggregated);
        return new Pair<>(aggregated, filtered);
    }

    /**
     * Returns the communities with the highest, strictly positive similarity
     * to the query tokens. Ties are all kept; communities scoring 0 are never
     * selected.
     *
     * @param configuration      supplies the language and the debug flag
     * @param comSimilarityBased candidate communities
     * @param tokens             query tokens the communities are scored against
     * @return the top-scoring communities (empty when none scores above 0)
     * @throws Exception propagated from the similarity computation
     */
    public static Set<Map<Entity, Double>> selectBestCommunities(Configuration configuration,
            Set<Map<Entity, Double>> comSimilarityBased, Set<String> tokens) throws Exception {

        println("Selecting best communities");
        Set<Map<Entity, Double>> winners = new HashSet<>();
        Map<Map<Entity, Double>, Double> scores = new HashMap<>();
        double topScore = 0.0;

        for (Map<Entity, Double> candidate : comSimilarityBased) {
            double score = calculateSimilarity(tokens, candidate, configuration.getLanguage());
            if (!(score >= topScore && score > 0)) {
                continue;
            }
            scores.put(candidate, score);
            if (score != topScore) {
                // Strictly better than anything seen so far: start over.
                winners = new HashSet<>();
                topScore = score;
            }
            winners.add(candidate);
        }

        if (configuration.DEBUG_INFO) {
            scores = UtilsMap.sortByValue(scores);
            int position = 0;
            for (Map.Entry<Map<Entity, Double>, Double> scored : scores.entrySet()) {
                position++;
                println("===============");
                println("Community " + position + ": " + scored.getValue() + ":"
                        + StringUtilsQueryExpansion.MapDoubleValueToString(scored.getKey()));
                println("===============");
            }
        }
        return winners;
    }

    /**
     * Scores a community against the query tokens.
     * <p>
     * For every entity with a positive weight, each of its names is tokenized;
     * when all tokens of a name appear among the query tokens, the number of
     * matched tokens is added to the score.
     *
     * @param tokens   query tokens
     * @param set      community (entity to weight)
     * @param language language passed to the tokenizer
     * @return accumulated number of matched tokens over all fully matched names
     * @throws Exception propagated from the tokenizer
     */
    private static double calculateSimilarity(Set<String> tokens, Map<Entity, Double> set,
            Definitions.LANGUAGE language) throws Exception {
        double score = 0.0;
        for (Map.Entry<Entity, Double> member : set.entrySet()) {
            // A weight of 0 means the article is not part of the community.
            if (!(member.getValue() > 0)) {
                continue;
            }
            for (String name : member.getKey().getEntityNames()) {
                // Tokenize once per name; the result is needed for both the
                // intersection and the size comparison.
                Collection<?> nameTokens = Tokenizer.getTokenizedList(name, language, false);
                int matched = CollectionUtils.intersection(nameTokens, tokens).size();
                if (matched > 0 && matched == nameTokens.size()) {
                    score += matched;
                }
            }
        }
        return score;
    }

    /**
     * Manual entry point: loads {@code input.cfg}, builds one hard-coded
     * sample query and runs the expansion stages up to and including the
     * community processing implemented in this class.
     *
     * @param args ignored
     */
    public static void main(String[] args) throws FileNotFoundException, IOException, Exception {
        Configuration config = new Configuration(new Parameters("input.cfg"));

        String sampleLine
                = "71;colored Volkswagen beetles;Volkswagen beetles in any other color, for example, red, blue, green or yellow; ; ; ; ; ; ; ; ; ; ; ; ;";
        Query sampleQuery = Query.readQueryLine(config, sampleLine);

        // Earlier pipeline stages, in the order the original runs them.
        QueryPreProcess.execute(config, sampleQuery);
        CreatePaths.execute(config, sampleQuery);
        SelectPaths.execute(config, sampleQuery);
        CreateCommunitiesFromPaths.execute(config, sampleQuery);

        execute(config, sampleQuery);
    }

    /**
     * Builds, for every input community, a "mega community" that merges it
     * with every other non-empty community whose similarity to it reaches the
     * threshold.
     * <p>
     * Similarity is always measured against the original community's members,
     * not against the growing merge. Merges are accumulated: earlier versions
     * overwrote the result on each match, so only the last similar community
     * (in hash-set iteration order) was actually fused.
     * NOTE(review): weights of shared entities are averaged pairwise by
     * {@code fusion}, so they can still depend on iteration order when more
     * than two communities share an entity.
     *
     * @param communities     communities to fuse
     * @param fusionThreshold minimum similarity (see {@code compareCommunities})
     *                        for two communities to be fused
     * @return the distinct, non-empty fused communities
     */
    private static Set<Map<Entity, Double>> getCommunitiesFromCommunitiesBasedOnSimilarity(
            Set<Map<Entity, Double>> communities, Double fusionThreshold) {
        Set<Map<Entity, Double>> megaCommunities = new HashSet<>();

        for (Map<Entity, Double> community : communities) {
            Set<Entity> communityIds = community.keySet();
            Map<Entity, Double> megaCommunity = new HashMap<>(community);

            for (Map<Entity, Double> candidate : communities) {
                // The mega community already starts as a copy of 'community'.
                if (candidate == community || candidate.isEmpty()) {
                    continue;
                }
                if (compareCommunities(communityIds, candidate.keySet()) >= fusionThreshold) {
                    megaCommunity = fusion(megaCommunity, candidate);
                }
            }

            if (!megaCommunity.isEmpty()) {
                megaCommunities.add(megaCommunity);
            }
        }
        return megaCommunities;
    }

    /**
     * Merges two communities into a new map. Entities present in only one
     * community keep their weight; entities present in both get the mean of
     * the two weights. Neither input is modified.
     *
     * @param c1 first community
     * @param c2 second community
     * @return a new map holding the union of both communities
     */
    private static Map<Entity, Double> fusion(Map<Entity, Double> c1, Map<Entity, Double> c2) {
        // Start from c2 and overlay c1, averaging where both define a weight.
        Map<Entity, Double> merged = new HashMap<>(c2);
        for (Map.Entry<Entity, Double> member : c1.entrySet()) {
            Entity entity = member.getKey();
            Double weight = member.getValue();
            if (c2.containsKey(entity)) {
                weight = (weight + c2.get(entity)) / 2;
            }
            merged.put(entity, weight);
        }
        return merged;
    }

    /**
     * Similarity between two communities: the size of their intersection
     * divided by the size of the smaller community (i.e. the larger of the
     * two containment ratios).
     *
     * @param c1 members of the first community
     * @param c2 members of the second community
     * @return a value in [0, 1]; 0 when either community is empty (instead of
     *         the NaN a 0/0 division would yield)
     */
    private static double compareCommunities(Set<Entity> c1, Set<Entity> c2) {
        if (c1.isEmpty() || c2.isEmpty()) {
            return 0.0;
        }
        double shared = CollectionUtils.intersection(c1, c2).size();
        return shared / Math.min(c1.size(), c2.size());
    }

    /**
     * Collapses several communities into one. The weight of an entity is the
     * sum of its weights across the given communities divided by the number
     * of communities (a community that lacks the entity contributes 0).
     * Entities whose resulting weight is not strictly positive are left out.
     *
     * @param selectBestCommunities communities to aggregate
     * @return entity to averaged weight
     */
    private static Map<Entity, Double> agregateCommunities(Set<Map<Entity, Double>> selectBestCommunities) {
        // First pass: running sum of the weights seen for each entity.
        Map<Entity, Double> totals = new HashMap<>();
        for (Map<Entity, Double> community : selectBestCommunities) {
            for (Map.Entry<Entity, Double> member : community.entrySet()) {
                Double soFar = totals.get(member.getKey());
                double base = (soFar == null) ? 0.0 : soFar;
                totals.put(member.getKey(), base + member.getValue());
            }
        }

        // Second pass: turn sums into means and drop non-positive ones.
        Map<Entity, Double> averaged = new HashMap<>();
        for (Map.Entry<Entity, Double> total : totals.entrySet()) {
            double mean = total.getValue() / selectBestCommunities.size();
            if (mean > 0.0) {
                averaged.put(total.getKey(), mean);
            }
        }
        return averaged;
    }

    /** Prints {@code string} on its own line, prefixed with this file's log tag. */
    private static void println(Object string) {
        String line = "   [ProcessCommunities.java]: " + string;
        System.out.println(line);
    }

    /** Prints a line containing only this file's log tag. */
    private static void println() {
        println("");
    }

    /** Prints {@code string} prefixed with this file's log tag, without a line break. */
    private static void print(Object string) {
        String text = "   [ProcessCommunities.java]: " + string;
        System.out.print(text);
    }

    /** Prints only this file's log tag, without a line break. */
    private static void print() {
        print("");
    }

    /**
     * Processes the communities of every query in turn and, when the
     * configuration asks for it, writes the topological extension of each
     * aggregated community.
     *
     * @param configuration run configuration
     * @param queries       queries to process
     * @throws Exception propagated from the per-query processing
     */
    public static void execute(Configuration configuration, Set<Query> queries) throws Exception {
        for (Query current : queries) {
            Pair<Map<Entity, Double>, Map<Entity, Double>> communities = execute(configuration, current);
            if (!configuration.printTopologicalExtension()) {
                continue;
            }
            printTopologicalExtension(current, communities.getFirst());
        }
    }

    /**
     * Writes the community and its categories as a Graphviz {@code digraph}
     * to {@code images/<queryId>TopologicalCommunity.txt}.
     * <ul>
     * <li>red edges: article to article links inside the community
     * (redirects excluded);</li>
     * <li>green edges: article to each of its categories;</li>
     * <li>blue edges: links between the categories involved.</li>
     * </ul>
     * Node sizes grow with the weight, relative to the heaviest entity; a
     * category takes the largest weight among the entities that belong to it.
     * <p>
     * Fixes over the previous version: weights are looked up by
     * {@code Entity} (the map key) rather than by its {@code Long} id, which
     * always returned {@code null}; article links are matched against the
     * community's ids rather than against the {@code Entity} set; entity
     * nodes are declared with the same numeric id the edges use; the writer
     * is closed even when an exception occurs.
     *
     * @param query     query the community belongs to (its id names the file)
     * @param community entity to weight
     * @throws FileNotFoundException        if the output directory or file
     *                                      cannot be created
     * @throws UnsupportedEncodingException propagated from the writer factory
     */
    private static void printTopologicalExtension(Query query, Map<Entity, Double> community)
            throws FileNotFoundException, UnsupportedEncodingException {

        File theDir = new File("images");
        if (!theDir.exists() && !theDir.mkdirs()) {
            throw new FileNotFoundException("Could not create output directory: " + theDir);
        }

        // Ids of the community members and the heaviest weight among them.
        Set<Long> entityIds = new HashSet<>();
        double maxW = 0.0;
        for (Map.Entry<Entity, Double> member : community.entrySet()) {
            entityIds.add(member.getKey().getId());
            maxW = Math.max(maxW, member.getValue());
        }
        // Weights are mapped onto sizes in [0.5, 2.5]; avoid dividing by zero.
        double scale = maxW > 0 ? 2 / maxW : 0.0;

        try (PrintWriter writer = FileManagement.Writer
                .getWriter("images/" + query.getId() + "TopologicalCommunity.txt")) {

            writer.println("strict digraph G{");

            Set<Long> categoriesSet = new HashSet<>();
            Map<Long, Double> categoryWeightMap = new HashMap<>();

            for (Map.Entry<Entity, Double> member : community.entrySet()) {
                Long entityID = member.getKey().getId();
                Double weight = member.getValue();

                Set<Long> neighbors = Article.getNeighbors(entityID, Graph.EDGES_OUT);
                for (Long neighbourID : neighbors) {
                    if (entityIds.contains(neighbourID) && !Article.isARedirect(entityID)
                            && !Article.isARedirect(neighbourID)) {
                        writer.println(entityID + " -> " + neighbourID + " [color=red];");
                    }
                }

                Set<Long> categories = Article.getCategories(entityID);
                categoriesSet.addAll(categories);
                for (Long categoryID : categories) {
                    writer.println(entityID + " -> " + categoryID + " [color=green];");
                    // A category keeps the largest weight of its member entities.
                    Double known = categoryWeightMap.get(categoryID);
                    if (known == null || known < weight) {
                        categoryWeightMap.put(categoryID, weight);
                    }
                }
            }

            for (Long categoryID : categoriesSet) {
                for (Long neighbourID : Category.getNeigbors(categoryID, Graph.EDGES_OUT)) {
                    if (categoriesSet.contains(neighbourID)) {
                        writer.println(categoryID + " -> " + neighbourID + " [color=blue];");
                    }
                }
                for (Long neighbourID : Category.getNeigbors(categoryID, Graph.EDGES_IN)) {
                    if (categoriesSet.contains(neighbourID)) {
                        writer.println(neighbourID + " -> " + categoryID + " [color=blue];");
                    }
                }
            }

            for (Map.Entry<Entity, Double> member : community.entrySet()) {
                Entity entity = member.getKey();
                String title = toDotLabel(entity.getName());
                String weight = BigDecimal.valueOf(scale * member.getValue() + .5).toPlainString();
                writer.println(entity.getId() + "[label=\"" + title + "\", width=" + weight + ", height=" + weight
                        + " fixedsize=true,style=filled,color=\"#c0c0c0\"];");
            }
            for (Long id : categoriesSet) {
                String title = toDotLabel((new Category(id)).getName());
                String weight = BigDecimal.valueOf(scale * categoryWeightMap.get(id) + .5).toPlainString();
                writer.println(id + "[label=\"" + title + "\", width=" + weight + ", height=" + weight
                        + " fixedsize=true,style=filled,color=\"#f0f0f0\"];");
            }

            writer.println("}");
        }
    }

    /**
     * Turns a title into text that is safe inside a double-quoted DOT label:
     * diacritics are stripped, runs of dots become a space and double quotes
     * are escaped.
     */
    private static String toDotLabel(String title) {
        String label = Normalizer.normalize(title, Normalizer.NFD);
        label = label.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
        label = label.replaceAll("[.]+", " ");
        return label.replace("\"", "\\\"");
    }

    /**
     * Proposes entities to drop from a community based on category distance
     * to its leading entities.
     * <p>
     * The "first level" entities are the leading entries of the community
     * after {@code MapUtils.sortByValue}, up to the first change in weight.
     * Their categories, the parents of those categories and the grandparents
     * form three reference levels. An entity is kept when any of its
     * categories is in one of the three levels, is a child of a second-level
     * category, or is a grandchild of a third-level category; every other
     * entity is returned as removable.
     *
     * @param community entity to weight
     * @return entities with no category close to the reference levels
     */
    private static Set<Entity> removableAccordingToCategories(Map<Entity, Double> community) {
        community = MapUtils.sortByValue(community);

        // Leading run of entries sharing the first entry's weight.
        Set<Entity> firstLevel = new HashSet<>();
        double lastSize = 0.0;
        for (Map.Entry<Entity, Double> e : community.entrySet()) {
            if (!firstLevel.isEmpty() && lastSize != e.getValue()) {
                break;
            }
            firstLevel.add(e.getKey());
            lastSize = e.getValue();
        }
        println("First level entities: " + firstLevel);

        Set<Category> firstLevelCategories = new HashSet<>();
        for (Entity e : firstLevel) {
            firstLevelCategories.addAll(e.getCategories());
        }
        println("First Level Categories = " + firstLevelCategories);

        Set<Category> secondLevelCategories = fathersOf(firstLevelCategories);
        println("Second Level Categories = " + secondLevelCategories);

        Set<Category> thirdLevelCategories = fathersOf(secondLevelCategories);
        println("Third Level Categories = " + thirdLevelCategories);

        Set<Entity> susceptibleToBeRemoved = new HashSet<>();
        for (Entity entity : community.keySet()) {
            boolean maintainEntity = false;
            for (Category c : entity.getCategories()) {
                if (firstLevelCategories.contains(c) || secondLevelCategories.contains(c)
                        || thirdLevelCategories.contains(c) || isSonOf(secondLevelCategories, c)
                        || isGrandSon(thirdLevelCategories, c)) {
                    // One matching category is enough; skip the remaining
                    // category-graph lookups for this entity.
                    maintainEntity = true;
                    break;
                }
            }
            if (!maintainEntity) {
                susceptibleToBeRemoved.add(entity);
            }
        }

        return susceptibleToBeRemoved;
    }

    /** Collects the parent categories of every category in {@code categories}. */
    private static Set<Category> fathersOf(Set<Category> categories) {
        Set<Category> fathers = new HashSet<>();
        for (Category c : categories) {
            fathers.addAll(Category.getFathers(c));
        }
        return fathers;
    }

    /**
     * Tells whether {@code c} has at least one parent category contained in
     * {@code secondLevelCategories}.
     */
    private static boolean isSonOf(Set<Category> secondLevelCategories, Category c) {
        Set<Category> parents = Category.getFathers(c);
        return !CollectionUtils.intersection(secondLevelCategories, parents).isEmpty();
    }

    /**
     * Tells whether {@code c} has at least one grandparent category contained
     * in {@code thirdLevelCategories}, i.e. whether any parent of {@code c}
     * is a child of one of those categories.
     */
    private static boolean isGrandSon(Set<Category> thirdLevelCategories, Category c) {
        for (Category father : Category.getFathers(c)) {
            if (isSonOf(thirdLevelCategories, father)) {
                // First hit decides; no need to look up the other parents.
                return true;
            }
        }
        return false;
    }

}