de.upb.wdqa.wdvd.FeatureExtractor.java Source code

Java tutorial

Introduction

Here is the source code for de.upb.wdqa.wdvd.FeatureExtractor.java

Source

/*
 * Wikidata Vandalism Detector 2016 (WDVD-2016)
 * 
 * Copyright (c) 2016 Stefan Heindorf, Martin Potthast, Benno Stein, Gregor Engels
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

package de.upb.wdqa.wdvd;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PipedInputStream;
import java.io.PipedOutputStream;
import java.net.InetAddress;
import java.net.URL;
import java.net.UnknownHostException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Enumeration;
import java.util.List;
import java.util.jar.Manifest;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.log4j.Appender;
import org.apache.log4j.AppenderSkeleton;
import org.apache.log4j.AsyncAppender;
import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.FileAppender;
import org.apache.log4j.Level;
import org.apache.log4j.PatternLayout;
import org.apache.log4j.spi.LoggingEvent;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikidata.wdtk.dumpfiles.ExtendedMwRevisionDumpFileProcessor;
import org.wikidata.wdtk.dumpfiles.MwRevisionProcessor;

import de.upb.wdqa.wdvd.db.ItemStore;
import de.upb.wdqa.wdvd.db.MemoryItemStore;
import de.upb.wdqa.wdvd.features.Feature;
import de.upb.wdqa.wdvd.labels.RevertMethod;
import de.upb.wdqa.wdvd.processors.RevisionProcessor;
import de.upb.wdqa.wdvd.processors.controlflow.AsyncProcessor;
import de.upb.wdqa.wdvd.processors.controlflow.ParallelProcessor;
import de.upb.wdqa.wdvd.processors.controlflow.TeeProcessor;
import de.upb.wdqa.wdvd.processors.decorators.CorpusLabelProcessor;
import de.upb.wdqa.wdvd.processors.decorators.FeatureProcessor;
import de.upb.wdqa.wdvd.processors.decorators.GeolocationDbProcessor;
import de.upb.wdqa.wdvd.processors.decorators.GeolocationFeatureProcessor;
import de.upb.wdqa.wdvd.processors.decorators.GroupProcessor;
import de.upb.wdqa.wdvd.processors.decorators.JsonProcessor;
import de.upb.wdqa.wdvd.processors.decorators.JsonProcessorReducer;
import de.upb.wdqa.wdvd.processors.decorators.PageProcessor;
import de.upb.wdqa.wdvd.processors.decorators.TagDownloaderProcessor;
import de.upb.wdqa.wdvd.processors.decorators.TextRegexProcessor;
import de.upb.wdqa.wdvd.processors.filters.SamplingFilterProcessor;
import de.upb.wdqa.wdvd.processors.output.CsvFeatureWriter;
import de.upb.wdqa.wdvd.processors.preprocessing.RawConverterProcessor;
import de.upb.wdqa.wdvd.processors.statistics.ActionStatisticsProcessor;
import de.upb.wdqa.wdvd.processors.statistics.CorpusStatisticsProcessor;
import de.upb.wdqa.wdvd.processors.statistics.LabelingStatisticsProcessor;
import de.upb.wdqa.wdvd.processors.statistics.RawDumpStatisticsProcessor;
import de.upb.wdqa.wdvd.processors.statistics.UserStatisticsProcessor;

public class FeatureExtractor {
    // SLF4J logger shared by all static methods of this class.
    static final Logger logger = LoggerFactory.getLogger(FeatureExtractor.class);

    // Compile-time switches consulted in executePipeline():
    // - geolocation: whether a geolocation processor may be added at all
    // - revision tags: whether a TagDownloaderProcessor may be added
    // - language proportion: passed to every TextRegexProcessor
    static final boolean PROCESSOR_GEOLOCATION_ENABLED = true;
    static final boolean PROCESSOR_REVISION_TAGS_ENABLED = true;
    static final boolean PROCESSOR_FEATURE_LANGUAGE_PROPORTION_ENABLED = true;

    // Passed to FeatureList.getFeatures() to select the feature set.
    static final boolean ALL_FEATURES_ENABLED = true;

    // Number of TextRegexProcessor / JsonProcessor instances handed to the
    // respective ParallelProcessor in executePipeline().
    static final int PROCESSOR_FEATURE_LANGUAGE_PROPORTION_THREADS = 12;
    static final int PROCESSOR_JSON_THREADS = 4;

    // Sampling rates passed to the SamplingFilterProcessor.
    static final double LOW_QUALITY_SAMPLING_RATE = 1.0;
    static final double HIGH_QUALITY_SAMPLING_RATE = 1.0;

    // Threshold applied to both the console and the file appender.
    static final Level LOG_LEVEL = Level.INFO;

    // Pipe size in bytes (256 MiB) of each PipedInputStream; two such pipes
    // are created (file reader and decompressor).
    static final int BUFFER_SIZE = 256 * 1024 * 1024;

    // Lazily initialised by getRunTime() and then reused.
    static String runTime;

    // Not instantiable: the class only exposes static entry points.
    private FeatureExtractor() {

    }

    /**
     * Command line entry point. Delegates to {@link #main2(String[])} and
     * terminates the JVM with the status code it returns.
     *
     * @param args command line arguments, forwarded unchanged
     */
    public static void main(String[] args) throws IOException {
        final int exitCode = main2(args);
        System.exit(exitCode);
    }

    /**
     * Runs the feature extraction without terminating the JVM.
     *
     * @param args command line arguments used to build the configuration
     * @return 0 on success; 1 if anything was thrown or if the
     *         {@link ErrorFlagAppender} recorded a logged error
     */
    public static int main2(String[] args) {
        int exitCode = 0;

        try {
            FeatureExtractorConfiguration config = new FeatureExtractorConfiguration(args);

            // the log file is written next to the feature file
            initLogger(config.getFeatureFile().getAbsoluteFile() + ".log");
            logConfiguration(config);

            executePipeline(config);

            logger.info("Feature Extraction finished!");

            // errors that were only logged (not thrown) still fail the run
            if (ErrorFlagAppender.hasErrorOccured()) {
                exitCode = 1;

                logger.error("##################################################");
                logger.error("# AN ERROR HAS OCCURED DURING FEATURE EXTRACTION #");
                logger.error("##################################################");
            }
        } catch (Throwable failure) {
            logger.error("", failure);
            exitCode = 1;
        } finally {
            try {
                closeLogger();
            } catch (Throwable closeFailure) {
                // the logger is unusable at this point, so report on stderr
                System.err.println(closeFailure);
            }
        }

        return exitCode;
    }

    /**
     * Builds the processing pipeline and feeds the revision dump through it.
     *
     * The processors are constructed starting with the CSV writer; each newly
     * created processor receives the previously built one as its constructor
     * argument. Once the chain is assembled, the revision file is read and
     * decompressed on two helper threads and handed to the dump file processor.
     *
     * An {@link IOException} is logged and not rethrown.
     *
     * @param config supplies the revision, label, feature, revision tag and
     *               geolocation files
     */
    public static void executePipeline(FeatureExtractorConfiguration config) {
        try {
            //ItemStore itemStore = new SQLItemStore();
            ItemStore itemStore = new MemoryItemStore();

            List<Feature> features = FeatureList.getFeatures(ALL_FEATURES_ENABLED);

            RevisionProcessor nextProcessor = new CsvFeatureWriter(config.getFeatureFile(), features);

            // one FeatureProcessor per feature, each wrapped in its own AsyncProcessor
            for (Feature feature : features) {
                List<Feature> featureList = new ArrayList<Feature>();

                featureList.add(feature);

                FeatureProcessor fp = new FeatureProcessor(nextProcessor, featureList);
                nextProcessor = new AsyncProcessor(fp, getFeatureString(featureList), 32);
            }

            nextProcessor = new AsyncProcessor(nextProcessor, "features", 1024);

            RevisionProcessor samplingRollbackProcessor = new SamplingFilterProcessor(nextProcessor,
                    LOW_QUALITY_SAMPLING_RATE, HIGH_QUALITY_SAMPLING_RATE, RevertMethod.ROLLBACK);

            String name = "all";
            RevisionProcessor wholeCorpusStatisticsProcessor = new CorpusStatisticsProcessor(null, name, itemStore);

            // the tee receives both the corpus statistics and the sampling/feature branch
            TeeProcessor teeProcessor = new TeeProcessor();
            teeProcessor.add(wholeCorpusStatisticsProcessor);
            teeProcessor.add(samplingRollbackProcessor);
            //      teeProcessor.add(samplingDownloadedSHA1Processor);
            //      teeProcessor.add(samplingSHA1Processor);

            nextProcessor = new ActionStatisticsProcessor(teeProcessor,
                    config.getFeatureFile().getAbsoluteFile() + "_monthlyActionDistribution.csv");

            nextProcessor = new LabelingStatisticsProcessor(nextProcessor);

            nextProcessor = new UserStatisticsProcessor(nextProcessor);

            nextProcessor = new GroupProcessor(nextProcessor);

            if (config.getLabelFile() != null) {
                nextProcessor = new CorpusLabelProcessor(nextProcessor, config.getLabelFile());
            } else {
                logger.info("Labels are disabled.");
            }

            nextProcessor = new PageProcessor(nextProcessor, itemStore);

            List<RevisionProcessor> parallelProcessorList = new ArrayList<RevisionProcessor>();
            for (int i = 0; i < PROCESSOR_FEATURE_LANGUAGE_PROPORTION_THREADS; i++) {
                RevisionProcessor textRegexProcessor = new TextRegexProcessor(
                        PROCESSOR_FEATURE_LANGUAGE_PROPORTION_ENABLED);
                parallelProcessorList.add(textRegexProcessor);
            }
            nextProcessor = new ParallelProcessor(parallelProcessorList, null, nextProcessor, "textRegex");

            parallelProcessorList = new ArrayList<RevisionProcessor>();
            for (int i = 0; i < PROCESSOR_JSON_THREADS; i++) {
                RevisionProcessor jsonProcessor = new JsonProcessor(null, i);
                parallelProcessorList.add(jsonProcessor);
            }
            nextProcessor = new ParallelProcessor(parallelProcessorList, new JsonProcessorReducer(), nextProcessor,
                    "json");

            if (PROCESSOR_REVISION_TAGS_ENABLED && (config.getRevisionTagFile() != null)) {
                nextProcessor = new TagDownloaderProcessor(nextProcessor, config.getRevisionTagFile());
            } else {
                logger.info("Revision tags are disabled.");
            }

            if (PROCESSOR_GEOLOCATION_ENABLED) {
                // the geolocation database takes precedence over the feature file
                if (config.getGeolocationDbFile() != null) {
                    nextProcessor = new GeolocationDbProcessor(nextProcessor, config.getGeolocationDbFile());
                } else if (config.getGeolocationFeatureFile() != null) {
                    nextProcessor = new GeolocationFeatureProcessor(nextProcessor,
                            config.getGeolocationFeatureFile());
                }
            } else {
                logger.info("Geolocation database is disabled.");
            }

            nextProcessor = new AsyncProcessor(nextProcessor, "initial", 1024);

            MwRevisionProcessor nextMwProcessor = new RawConverterProcessor(nextProcessor);

            nextMwProcessor = new RawDumpStatisticsProcessor(nextMwProcessor);

            ExtendedMwRevisionDumpFileProcessor dumpFileProcessor = new ExtendedMwRevisionDumpFileProcessor(
                    nextMwProcessor);

            //      dumpFileProcessor.processDumpFileContents(dumpFile.getDumpFileStream(), dumpFile);

            // one thread retrieves the data and the other thread decompresses it.
            InputStream uncompressedStream = getUncompressedStream(
                    (getPipedDumpFileStream(getCompressedDumpFileStream(config.getRevisionFile()))));

            try {
                dumpFileProcessor.processDumpFileContents(uncompressedStream);
            } finally {
                // Close the stream even when processing fails. A failure to
                // close is logged here so that it cannot replace an exception
                // that is already propagating out of the try block.
                try {
                    uncompressedStream.close();
                } catch (IOException e) {
                    logger.error("", e);
                }
            }

        } catch (IOException e) {
            logger.error("", e);
        }
    }

    /**
     * Opens the given file for reading through a buffered stream.
     *
     * @param file the (still compressed) dump file
     * @return a buffered stream over the file contents
     * @throws IOException if the file cannot be opened
     */
    private static InputStream getCompressedDumpFileStream(File file) throws IOException {
        return new BufferedInputStream(new FileInputStream(file));
    }

    /**
     * Starts a thread that copies {@code inputStream} into a pipe and returns
     * the reading end of that pipe.
     *
     * @param inputStream the stream to drain; it is closed by the copying
     *                    thread, whether or not the copy succeeds
     * @return the reading end of a pipe with a capacity of BUFFER_SIZE bytes
     * @throws IOException if the pipe cannot be created
     */
    private static InputStream getPipedDumpFileStream(final InputStream inputStream) throws IOException {
        final PipedOutputStream pipedOutputStream = new PipedOutputStream();
        final PipedInputStream pipedInputStream = new PipedInputStream(pipedOutputStream, BUFFER_SIZE);

        new Thread("Dump File Reader") {
            @Override
            public void run() {
                try {
                    try {
                        IOUtils.copy(inputStream, pipedOutputStream);
                    } finally {
                        // always release the source, even when the copy fails
                        inputStream.close();
                    }

                    // Only reached after a complete copy. The pipe is
                    // deliberately left unclosed on failure so that the
                    // consumer does not see a truncated stream as a regular
                    // end of input.
                    pipedOutputStream.close();
                } catch (Throwable t) {
                    logger.error("", t);
                }
            }
        }.start();

        return pipedInputStream;
    }

    /**
     * Decompresses the bzip2 input stream in a new thread and returns the
     * reading end of a pipe that carries the decompressed bytes.
     *
     * @param inputStream bzip2-compressed data; it is closed by the
     *                    decompressor thread, whether or not it succeeds
     * @return the reading end of a pipe with a capacity of BUFFER_SIZE bytes
     * @throws IOException if the pipe cannot be created
     */
    private static InputStream getUncompressedStream(final InputStream inputStream) throws IOException {
        // the decompression is a major bottleneck, make sure that it does not
        // have to wait for the buffer to empty
        final PipedOutputStream pipedOutputStream = new PipedOutputStream();
        final PipedInputStream pipedInputStream = new PipedInputStream(pipedOutputStream, BUFFER_SIZE);

        new Thread("Dump File Decompressor") {
            @Override
            public void run() {
                try {
                    InputStream compressorInputStream = null;
                    try {
                        compressorInputStream = new BZip2CompressorInputStream(inputStream);

                        IOUtils.copy(compressorInputStream, pipedOutputStream);
                    } finally {
                        if (compressorInputStream != null) {
                            compressorInputStream.close();
                        } else {
                            // the decompressor could not be created, so
                            // release the raw input directly
                            inputStream.close();
                        }
                    }

                    // Only reached after a complete copy; on failure the pipe
                    // stays unclosed so the consumer does not mistake a
                    // truncated stream for a regular end of input.
                    pipedOutputStream.close();
                } catch (Throwable t) {
                    // Throwable (as in the file reader thread) so that
                    // unchecked failures are logged as well
                    logger.error("", t);
                }
            }
        }.start();

        return pipedInputStream;
    }

    /**
     * Attaches three appenders to the log4j root logger: one that records
     * whether an error was logged, an asynchronous console appender and an
     * asynchronous file appender.
     *
     * @param filename the log file; it is overwritten, not appended to
     */
    private static void initLogger(String filename) {
        final String PATTERN = "[%d{yyyy-MM-dd HH:mm:ss}] [%-5p] [%c{1}] %m%n";
        final org.apache.log4j.Logger rootLogger = org.apache.log4j.Logger.getRootLogger();

        // Stores whether an error has occurred
        AppenderSkeleton errorFlag = new ErrorFlagAppender();
        errorFlag.setThreshold(Level.ERROR);
        errorFlag.activateOptions();
        rootLogger.addAppender(errorFlag);

        // console output
        ConsoleAppender console = new ConsoleAppender();
        console.setEncoding("UTF-8");
        console.setLayout(new PatternLayout(PATTERN));
        console.setThreshold(LOG_LEVEL);
        console.activateOptions();
        rootLogger.addAppender(asAsync(console));

        // file output
        FileAppender file = new FileAppender();
        file.setEncoding("UTF-8");
        file.setFile(filename);
        file.setLayout(new PatternLayout(PATTERN));
        file.setThreshold(LOG_LEVEL);
        file.setAppend(false);
        file.activateOptions();
        rootLogger.addAppender(asAsync(file));
    }

    // Wraps an appender in an activated AsyncAppender with a buffer size of 1024.
    private static AsyncAppender asAsync(Appender delegate) {
        AsyncAppender async = new AsyncAppender();
        async.addAppender(delegate);
        async.setBufferSize(1024);
        async.activateOptions();
        return async;
    }

    /**
     * Appender that produces no output and only remembers that at least one
     * logging event reached it. initLogger() registers it with an ERROR
     * threshold.
     */
    static class ErrorFlagAppender extends AppenderSkeleton {
        // volatile: the flag is set by whichever thread logs the event (the
        // pipeline starts several threads that log errors) and is read by the
        // thread that computes the exit code.
        static volatile boolean hasErrorOccured = false;

        /**
         * @return true once any event has been appended
         */
        public static boolean hasErrorOccured() {
            return hasErrorOccured;
        }

        @Override
        public void close() {
            // nothing to release
        }

        @Override
        public boolean requiresLayout() {
            return false;
        }

        @Override
        protected void append(LoggingEvent arg0) {
            hasErrorOccured = true;
        }
    }

    /**
     * Writes host, JVM, build and configuration details to the log at INFO
     * level. Does nothing when INFO is disabled.
     *
     * @param config the configuration whose file settings are logged
     */
    private static void logConfiguration(FeatureExtractorConfiguration config) {
        if (!logger.isInfoEnabled()) {
            return;
        }

        // Host and operating system
        logger.info("Host name: " + getHostName());
        logger.info("Operating system: " + System.getProperty("os.name"));

        // Java
        logger.info("java.home: " + System.getProperty("java.home"));
        logger.info("java.version: " + System.getProperty("java.version"));
        logger.info("java.runtime.name: " + System.getProperty("java.runtime.name"));
        logger.info("java.runtime.version: " + System.getProperty("java.runtime.version"));
        logger.info("java.vm.name: " + System.getProperty("java.vm.name"));
        logger.info("java.vm.version: " + System.getProperty("java.vm.version"));
        logger.info("java.vm.vendor: " + System.getProperty("java.vm.vendor"));

        // Feature Extraction
        logger.info("Filename of JAR: " + getJarFile());
        String implementationVersion = FeatureExtractor.class.getPackage().getImplementationVersion();
        logger.info("Implementation version: " + implementationVersion);
        logger.info("Build time: " + getBuildTime());
        logger.info("Run time: " + getRunTime());

        // Configuration: files
        logger.info("Revision file: " + config.getRevisionFile());
        logger.info("Label file: " + config.getLabelFile());
        logger.info("Feature file: " + config.getFeatureFile());
        logger.info("Revision tag file: " + config.getRevisionTagFile());
        logger.info("Geolocation database file: " + config.getGeolocationDbFile());
        logger.info("Geolocation feature file: " + config.getGeolocationFeatureFile());

        // Configuration: compile-time switches
        logger.info("Geolocation enabled: " + PROCESSOR_GEOLOCATION_ENABLED);
        logger.info("TagDownloader enabled: " + PROCESSOR_REVISION_TAGS_ENABLED);
        logger.info("Feature languageProportion enabled: " + PROCESSOR_FEATURE_LANGUAGE_PROPORTION_ENABLED);
    }

    /**
     * Shuts log4j down and then closes every appender that is still attached
     * to the root logger.
     */
    private static void closeLogger() {
        org.apache.log4j.LogManager.shutdown();

        for (Enumeration<?> remaining = org.apache.log4j.Logger.getRootLogger().getAllAppenders(); remaining
                .hasMoreElements();) {
            ((Appender) remaining.nextElement()).close();
        }
    }

    /**
     * Looks up the name of the local host.
     *
     * @return the host name, or null if it cannot be resolved
     */
    private static String getHostName() {
        String result = null;
        try {
            result = InetAddress.getLocalHost().getHostName();
        } catch (UnknownHostException e) {
            // best effort: the name is only used for logging, so record the
            // reason and fall back to null instead of failing
            logger.warn("Could not determine host name", e);
        }

        return result;
    }

    /**
     * Reads the "Build-Time" main attribute from the first
     * META-INF/MANIFEST.MF that the class loader reports.
     *
     * NOTE(review): the first manifest found is not necessarily the one that
     * belongs to this application's JAR — verify if the value matters.
     *
     * @return the attribute value, or null if no manifest is found, the
     *         attribute is absent, or the manifest cannot be read
     */
    private static String getBuildTime() {
        String result = null;
        try {
            Enumeration<URL> resources = FeatureExtractor.class.getClassLoader()
                    .getResources("META-INF/MANIFEST.MF");

            // guard against an empty enumeration: nextElement() would throw
            // NoSuchElementException and abort the whole run
            if (resources.hasMoreElements()) {
                InputStream manifestStream = resources.nextElement().openStream();
                try {
                    Manifest manifest = new Manifest(manifestStream);

                    result = manifest.getMainAttributes().getValue("Build-Time");
                } finally {
                    manifestStream.close();
                }
            }
        } catch (IOException e) {
            // best effort: the build time is only used for logging
            logger.warn("Could not read build time from manifest", e);
        }

        return result;
    }

    /**
     * Returns the time stamp of the first call, formatted as yyyyMMddHHmm.
     * The value is computed once and stored in the static runTime field.
     *
     * @return the cached run time string
     */
    private static String getRunTime() {
        if (runTime != null) {
            return runTime;
        }

        SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddHHmm");
        runTime = formatter.format(Calendar.getInstance().getTime());

        return runTime;
    }

    /**
     * Determines the location (JAR file or class directory) this class was
     * loaded from.
     *
     * @return the code source location as a file, or null if the class has no
     *         code source or the code source has no location
     */
    private static File getJarFile() {
        java.security.CodeSource codeSource = FeatureExtractor.class.getProtectionDomain().getCodeSource();
        if (codeSource == null || codeSource.getLocation() == null) {
            return null;
        }

        URL location = codeSource.getLocation();
        try {
            // Going through the URI decodes percent-escapes (for example %20
            // for a space), which URL.getPath() leaves in place.
            return new File(location.toURI());
        } catch (java.net.URISyntaxException e) {
            // the URL is not a valid URI: use the raw path as before
            return new File(location.getPath());
        } catch (IllegalArgumentException e) {
            // the URI is not a plain file: URI: use the raw path as before
            return new File(location.getPath());
        }
    }

    /**
     * Concatenates the names of the given features, each followed by a comma
     * (so the result ends with a comma unless the list is empty).
     *
     * @param features the features whose names are joined
     * @return the concatenated names
     */
    private static String getFeatureString(List<Feature> features) {
        StringBuilder joined = new StringBuilder();
        for (Feature current : features) {
            joined.append(current.getName()).append(",");
        }

        return joined.toString();
    }

}