org.univ.montp2.master.gmin313.DataCrawler.java Source code

Java tutorial

Introduction

Here is the source code for org.univ.montp2.master.gmin313.DataCrawler.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package org.univ.montp2.master.gmin313;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.logging.Level;
import java.util.logging.Logger;
import weka.core.*;

/**
 * Command-line utility that runs {@code MyCrawler}, converts the text files found in the crawl
 * directory into a Weka {@link Instances} dataset labelled with a lexicon-based sentiment class,
 * and writes that dataset to {@code <user.home>/output/weka.arff}.
 *
 * <p>Files are read and written with the platform default charset.
 *
 * @author marcoooo
 */
public class DataCrawler {

    /** Directory that is wiped by {@link #main} and then scanned for {@code .txt} files. */
    public static final String crawlResultDir = System.getProperty("user.home") + "/crawl/";

    /** Base directory (the user's home) holding the {@code classifier/} and {@code output/} folders. */
    public static final String workingDir = System.getProperty("user.home") + "/";

    private static final Logger LOGGER = Logger.getLogger(DataCrawler.class.getName());

    /** Characters that separate tokens when a line is split; kept verbatim from the original code. */
    private static final String TOKEN_DELIMITERS = "  ,;':%?!";

    /**
     * Empties the crawl directory, runs the crawler, builds the dataset and writes it to disk.
     *
     * <p>Any failure is logged at {@code SEVERE} level; nothing is rethrown.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            File crawlDir = new File(crawlResultDir);
            delete(crawlDir);
            crawlDir.mkdirs();

            // NOTE(review): MyCrawler is defined elsewhere; presumably it fills crawlResultDir
            // with the .txt files consumed below -- confirm against its implementation.
            MyCrawler crawler = new MyCrawler();
            crawler.crawlWebSites();

            Instances dataset = createDataset(crawlResultDir);

            File theFile = new File(workingDir + "/output/weka.arff");
            System.out.println("Directory : " + theFile.getAbsolutePath());

            // FileWriter does not create missing parent directories, so make sure they exist.
            File outputDir = theFile.getParentFile();
            if (outputDir != null) {
                outputDir.mkdirs();
            }

            try (FileWriter fw = new FileWriter(theFile.getAbsolutePath());
                 BufferedWriter out = new BufferedWriter(fw)) {
                out.write(dataset.toString());
            }
        } catch (Exception ex) {
            // Top-level boundary: one log record carries both the message and the stack trace.
            LOGGER.log(Level.SEVERE, "Crawling or dataset generation failed", ex);
        }
    }

    /**
     * Deletes a file, or a directory together with everything below it, on a best-effort basis.
     *
     * <p>Entries that cannot be removed are left in place without raising an error; a
     * "deleted" message is printed only for entries that were actually removed. A path that
     * does not exist is silently ignored.
     *
     * @param file the file or directory to remove
     * @throws IOException never thrown by this implementation; kept in the signature so that
     *         existing callers continue to compile
     */
    public static void delete(File file) throws IOException {
        System.out.println("File to delete : " + file.getAbsolutePath());

        if (file.isDirectory()) {
            // File.list() returns null when the directory cannot be read.
            String[] children = file.list();
            if (children != null) {
                for (String child : children) {
                    delete(new File(file, child));
                }
            }
            // File.delete() only succeeds on an empty directory, which covers the
            // "check again, then delete" step of the recursive removal.
            if (file.delete()) {
                System.out.println("Directory is deleted : " + file.getAbsolutePath());
            }
        } else if (file.delete()) {
            System.out.println("File is deleted : " + file.getAbsolutePath());
        }
    }

    /**
     * Reads a word list from {@code <workingDir>/classifier/<fileName>}, one entry per line.
     *
     * @param fileName name of the file inside the {@code classifier} directory
     * @return the trimmed lines in file order; the lines read so far (possibly none) if the
     *         file cannot be read, in which case the error is logged
     */
    public static List<String> getListClassifier(String fileName) {
        List<String> lines = new ArrayList<>();
        String path = workingDir + "/classifier/" + fileName;

        try (BufferedReader bufferedReader = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                lines.add(line.trim());
            }
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Unable to read word list " + path, ex);
        }

        return lines;
    }

    /**
     * Builds a dataset with one instance per {@code .txt} file found directly in a directory.
     *
     * <p>Each file must contain at least two lines: the first is stored in the {@code title}
     * attribute and the second in the {@code content} attribute. Both are lower-cased, split on
     * the token delimiters, and stripped of the words listed in {@code stopw.txt}. The
     * {@code class} attribute is {@code positif} when the two lines contain more tokens from
     * {@code GI_pos_sansNeutre.txt} than from {@code GI_neg_sansNeutre.txt}, {@code negatif} in
     * the opposite case, and {@code neutre} on a tie.
     *
     * <p>A file that cannot be converted (unreadable, or shorter than two lines) is reported on
     * {@code System.err} and skipped.
     *
     * @param directoryPath directory to scan
     * @return the dataset, possibly without any instance
     * @throws Exception if {@code directoryPath} cannot be listed
     */
    public static Instances createDataset(String directoryPath) throws Exception {
        FastVector atts = new FastVector(4);
        atts.addElement(new Attribute("filename", (FastVector) null));
        atts.addElement(new Attribute("title", (FastVector) null));
        atts.addElement(new Attribute("content", (FastVector) null));
        FastVector classes = new FastVector(3);
        classes.addElement("positif");
        classes.addElement("negatif");
        classes.addElement("neutre");
        atts.addElement(new Attribute("class", classes));
        Instances data = new Instances("text_files_in_" + directoryPath, atts, 0);

        String[] files = new File(directoryPath).list();
        if (files == null) {
            // list() yields null for a missing or unreadable directory; fail with a clear message.
            throw new IOException("Cannot list directory: " + directoryPath);
        }

        // Sets give constant-time lookups; the lists are consulted once per token.
        Set<String> stopWords = new HashSet<>(getListClassifier("stopw.txt"));
        Set<String> posWords = new HashSet<>(getListClassifier("GI_pos_sansNeutre.txt"));
        Set<String> negWords = new HashSet<>(getListClassifier("GI_neg_sansNeutre.txt"));

        for (String fileName : files) {
            if (!fileName.endsWith(".txt")) {
                continue;
            }

            String path = directoryPath + File.separator + fileName;
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(new FileInputStream(new File(path))))) {
                double[] newInst = new double[4];
                newInst[0] = (double) data.attribute(0).addStringValue(fileName);

                // Scores are accumulated over both the title and the content line.
                int scorePositif = 0;
                int scoreNegatif = 0;

                // Attribute 1 is the title (first line), attribute 2 the content (second line).
                for (int j = 1; j < 3; j++) {
                    String strLine = br.readLine();
                    if (strLine == null) {
                        throw new IOException("expected a title line and a content line");
                    }
                    System.out.println(strLine);

                    // A fresh buffer per line keeps the title text out of the content attribute.
                    StringBuilder kept = new StringBuilder();
                    StringTokenizer tokenizer = new StringTokenizer(strLine, TOKEN_DELIMITERS);
                    while (tokenizer.hasMoreTokens()) {
                        String token = tokenizer.nextToken().toLowerCase();
                        if (!stopWords.contains(token)) {
                            kept.append(token).append(' ');
                        }
                        // Score computation: one point per lexicon hit.
                        if (posWords.contains(token)) {
                            scorePositif++;
                        }
                        if (negWords.contains(token)) {
                            scoreNegatif++;
                        }
                    }
                    newInst[j] = (double) data.attribute(j).addStringValue(kept.toString());
                }

                String label;
                if (scorePositif > scoreNegatif) {
                    label = "positif";
                } else if (scorePositif < scoreNegatif) {
                    label = "negatif";
                } else {
                    label = "neutre";
                }
                newInst[3] = (double) data.attribute(3).indexOfValue(label);

                data.add(new Instance(1.0, newInst));
            } catch (Exception e) {
                // Best effort: one bad file must not abort the conversion of the others.
                System.err.println("failed to convert file: " + path);
            }
        }
        return data;
    }
}