di.uniba.it.tri.occ.BuildOccurrence.java Source code

Java tutorial

Introduction

Here is the source code for di.uniba.it.tri.occ.BuildOccurrence.java

Source

/**
 * Copyright (c) 2014, the Temporal Random Indexing AUTHORS.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * Neither the name of the University of Bari nor the names of its contributors
 * may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007
 *
 */
package di.uniba.it.tri.occ;

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multiset.Entry;
import di.uniba.it.tri.extractor.Extractor;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

/**
 *
 * @author pierpaolo
 */
public class BuildOccurrence {

    private int winsize = 5;

    private File outputDir = new File("./");

    private static final Logger logger = Logger.getLogger(BuildOccurrence.class.getName());

    private Extractor extractor;

    public BuildOccurrence() {
    }

    public int getWinsize() {
        return winsize;
    }

    public void setWinsize(int winsize) {
        this.winsize = winsize;
    }

    public File getOutputDir() {
        return outputDir;
    }

    public void setOutputDir(File outputDir) {
        this.outputDir = outputDir;
    }

    public Extractor getExtractor() {
        return extractor;
    }

    public void setExtractor(Extractor extractor) {
        this.extractor = extractor;
    }

    private Map<String, Multiset<String>> count(File startingDir, int year) throws IOException {
        Map<String, Multiset<String>> map = new HashMap<>();
        File[] listFiles = startingDir.listFiles();
        for (File file : listFiles) {
            if (file.getName().endsWith("_" + String.valueOf(year))) {
                logger.log(Level.INFO, "Working file: {0}", file.getName());
                StringReader reader = extractor.extract(file);
                List<String> tokens = getTokens(reader);
                for (int i = 0; i < tokens.size(); i++) {
                    int start = Math.max(0, i - winsize);
                    int end = Math.min(tokens.size() - 1, i + winsize);
                    for (int j = start; j < end; j++) {
                        if (i != j) {
                            Multiset<String> multiset = map.get(tokens.get(i));
                            if (multiset == null) {
                                multiset = HashMultiset.create();
                                map.put(tokens.get(i), multiset);
                            }
                            multiset.add(tokens.get(j));
                        }
                    }
                }
            }
        }
        return map;
    }

    private List<String> getTokens(Reader reader) throws IOException {
        List<String> tokens = new ArrayList<>();
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        TokenStream tokenStream = analyzer.tokenStream("text", reader);
        tokenStream.reset();
        CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            String token = cattr.toString();
            String[] split = token.split("'");
            if (split.length == 1) {
                tokens.add(token);
            } else {
                int max = 0;
                int index = 0;
                for (int i = 0; i < split.length; i++) {
                    if (split[i].length() > max) {
                        max = split[i].length();
                        index = i;
                    }
                }
                tokens.add(split[index]);
            }
        }
        tokenStream.end();
        return tokens;
    }

    public void process(File startingDir) throws Exception {
        logger.log(Level.INFO, "Starting dir: {0}", startingDir.getAbsolutePath());
        logger.log(Level.INFO, "Output dir: {0}", outputDir.getAbsolutePath());
        logger.log(Level.INFO, "Window size: {0}", winsize);
        File[] listFiles = startingDir.listFiles();
        int minYear = Integer.MAX_VALUE;
        int maxYear = -Integer.MAX_VALUE;
        for (File file : listFiles) {
            int i = file.getName().lastIndexOf("_");
            if (i > -1) {
                int year = Integer.parseInt(file.getName().substring(i + 1));
                if (year < minYear) {
                    minYear = year;
                }
                if (year > maxYear) {
                    maxYear = year;
                }
            }
        }
        logger.log(Level.INFO, "Form year: {0}", minYear);
        logger.log(Level.INFO, "To year: {0}", maxYear);
        for (int k = minYear; k <= maxYear; k++) {
            logger.log(Level.INFO, "Counting year: {0}", k);
            Map<String, Multiset<String>> count = count(startingDir, k);
            if (!count.isEmpty()) {
                save(count, k);
            }
        }
    }

    private void save(Map<String, Multiset<String>> count, int year) throws IOException {
        BufferedWriter writer = new BufferedWriter(new FileWriter(outputDir.getAbsolutePath() + "/count_" + year));
        Iterator<String> keys = count.keySet().iterator();
        while (keys.hasNext()) {
            String key = keys.next();
            writer.append(key);
            Set<Multiset.Entry<String>> entrySet = count.get(key).entrySet();
            for (Entry<String> entry : entrySet) {
                writer.append("\t").append(entry.getElement()).append("\t")
                        .append(String.valueOf(entry.getCount()));
            }
            writer.newLine();
        }
        writer.close();
    }

    /**
     * corpusDir outputDir windowSize extractor_class
     *
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        if (args.length == 4) {
            try {
                Extractor extractor = (Extractor) Class.forName(args[4]).newInstance();
                BuildOccurrence builder = new BuildOccurrence();
                builder.setOutputDir(new File(args[1]));
                builder.setWinsize(Integer.parseInt(args[2]));
                builder.setExtractor(extractor);
                builder.process(new File(args[0]));
            } catch (Exception ex) {
                logger.log(Level.SEVERE, null, ex);
            }
        } else {
            logger.warning("No valid arguments");
        }
    }

}