de.tudarmstadt.lt.n2n.hadoop.pipetests.GoogleSyntacticsJob5.java Source code

Here is the source code for de.tudarmstadt.lt.n2n.hadoop.pipetests.GoogleSyntacticsJob5.java
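
The class drives a DKPro BigData Hadoop pipeline: each input record is turned into a UIMA CAS, annotated with the JoBim "Google syntactics" relation engine in the map phase, keyed by the hash of its document text, and reduced to one output line per extracted JoBim pair and extractor configuration.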

Source

/*
 *   Copyright 2012
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package de.tudarmstadt.lt.n2n.hadoop.pipetests;

import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;

import jobimtext.holing.extractor.JobimAnnotationExtractor;
import jobimtext.holing.extractor.JobimExtractorConfiguration;
import jobimtext.holing.type.JoBim;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.util.ToolRunner;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.fit.factory.AggregateBuilder;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.lt.n2n.annotators.JoBimPrinter;
import de.tudarmstadt.lt.n2n.pipelines.JoBimRelationPipeline;
import de.tudarmstadt.lt.n2n.utilities.SHARED_CONSTANTS;
import de.tudarmstadt.lt.utilities.types.RepeatedSentence;
import de.tudarmstadt.ukp.dkpro.bigdata.hadoop.DkproHadoopDriver;
import de.tudarmstadt.ukp.dkpro.bigdata.hadoop.DkproMapper;
import de.tudarmstadt.ukp.dkpro.bigdata.io.hadoop.CASWritable;
import de.tudarmstadt.ukp.dkpro.bigdata.io.hadoop.Text2CASInputFormat;
import de.tudarmstadt.ukp.dkpro.bigdata.io.hadoop.Text2CASInputFormat.DocumentTextExtractor;

/**
 * 
 * @author Steffen Remus
 */
public class GoogleSyntacticsJob5 extends DkproHadoopDriver {

    static void print_usage(String message) {
        if (message != null && !"".equals(message))
            System.out.println(message);
        // Usage: HadoopPipe [hadoop-params] input output [num-mappers]

        System.out.format(
                "Usage: ... %s -D%s=<extractor-configuration-file1>,<extractor-configuration-file2>,... input output [num-mappers]  %n",
                GoogleSyntacticsJob5.class.getName(), SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
    }

    public static void main(String[] args) {
        try {
            ToolRunner.run(new Configuration(), new GoogleSyntacticsJob5(), args);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

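    // Builds the UIMA aggregate that every mapper applies to its CAS: the
    // JoBim "Google syntactics" relation engine with tokenization, sentence
    // splitting, dependency parsing and dependency paths up to length 5
    // (see the inline flag comments below).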
    @Override
    public AnalysisEngineDescription buildMapperEngine(Configuration conf) throws ResourceInitializationException {
        AggregateBuilder builder = new AggregateBuilder();
        // builder.add(AnalysisEngineFactory.createEngineDescription(MetaDataAnnotator.class));
        builder.add(AnalysisEngineFactory.createEngineDescription(
                JoBimRelationPipeline.createGoogleSyntacticsRelationEngine(true/* create_tokens */,
                        true/* create_sentences */, true/* create_dependencies */, true/* create_new_relations */,
                        true/* create_dependency_path */, true/* ignore_nn_relations */,
                        5/* dependency_path_maxlength */, false/* create_detailed_output */,
                        null/* extractor_configuration */, null/* output_destination */)));
        return builder.createAggregateDescription();
    }

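    // No UIMA engine runs on the reduce side; reduction is handled by the
    // plain Hadoop JoBimReducer wired up in configure() below.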
    @Override
    public AnalysisEngineDescription buildReducerEngine(Configuration job) throws ResourceInitializationException {
        return null;
    }

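    // Input records are plain text; Text2CASInputFormat converts them into
    // CASes using the document text extractor registered in configure().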
    @Override
    public Class<?> getInputFormatClass() {
        return Text2CASInputFormat.class;
    }

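    // Resolves the extractor configuration files (falling back to the shared
    // defaults), ships them to the task nodes via the DistributedCache, and
    // wires up input conversion, mapper, reducer, output format and per-task
    // memory settings.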
    @Override
    public void configure(JobConf job) {
        String extractorConfigurationFiles = job.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
        if (extractorConfigurationFiles == null) {
            extractorConfigurationFiles = StringUtils.join(SHARED_CONSTANTS.DEFAULT_EXTRACTOR_CONFIGURATIONS, ',');
            System.out.format("Extractorconfigurationfile parameter not set. Assuming -D%s=%s %n",
                    SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
            job.set(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, extractorConfigurationFiles);
        }
        try {
            String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
            for (int i = 0; i < extractorConfigurationFilesArr.length; i++)
                DistributedCache.addFileToClassPath(new Path(extractorConfigurationFilesArr[i]), job);
        } catch (IOException e) {
            e.printStackTrace();
        }
        Text2CASInputFormat.setDocumentTextExtractorClass(job, KeyPlusValueAsDocumentExtractor.class);
        job.setMapperClass(JoBimMapper.class);
        job.setReducerClass(JoBimReducer.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setMemoryForMapTask(4096);
        job.setMemoryForReduceTask(4096);
        job.set("mapred.child.java.opts", "-Xmx4096m");
        job.setNumReduceTasks(1); // a single reducer (the MapReduce default)
    }

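    // Concatenates the record key and value, separated by a tab, into the CAS
    // document text. The shared static Text instance is reused across calls,
    // which presumably relies on Hadoop map tasks being single-threaded.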
    public static class KeyPlusValueAsDocumentExtractor implements DocumentTextExtractor {
        private static Text _text = new Text();

        @Override
        public Text extractDocumentText(Text key, Text value) {
            _text.set(key.toString() + "\t" + value.toString());
            return _text;
        }
    }

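    // Keys each CAS by the hash code of its document text, so identical
    // documents end up in the same reduce group.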
    public static class JoBimMapper extends DkproMapper {
        @Override
        protected Text getOutputKey(Text key, CAS aCAS) {
            return new Text(String.valueOf(aCAS.getDocumentText().hashCode()));
        }
    }

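    // Emits one output line per JoBim annotation and extractor configuration,
    // carrying the repetition count of the covering RepeatedSentence.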
    public static class JoBimReducer extends MapReduceBase
            implements Reducer<Writable, CASWritable, Text, NullWritable> {

        Log logger = LogFactory.getLog(JoBimReducer.class);

        JobimAnnotationExtractor[] _extractors;

        Text _line = new Text();

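        // Maps each configured extractor file to its DistributedCache-local
        // copy and instantiates one JobimAnnotationExtractor per file.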
        @Override
        public void configure(JobConf job) {
            try {
                String extractorConfigurationFiles = job.get(SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS);
                String[] extractorConfigurationFilesArr = extractorConfigurationFiles.split(",");
                for (int i = 0; i < extractorConfigurationFilesArr.length; i++) {
                    String extractorConfigurationFileName = new File(extractorConfigurationFilesArr[i]).getName();
                    for (Path p : DistributedCache.getLocalCacheFiles(job))
                        if (p.getName().contains(extractorConfigurationFileName))
                            extractorConfigurationFilesArr[i] = p.toString();
                }

                try {
                    _extractors = new JobimAnnotationExtractor[extractorConfigurationFilesArr.length];
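                    // Note: only the bare file name is passed on here, so the
                    // localized cache file must be resolvable by name (e.g.
                    // from the task's working directory or classpath).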
                    for (int i = 0; i < extractorConfigurationFilesArr.length; i++)
                        _extractors[i] = JobimExtractorConfiguration
                                .getExtractorFromXmlFile(new File(extractorConfigurationFilesArr[i]).getName());
                } catch (Exception e) {
                    throw new ResourceInitializationException(e);
                }

            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }

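        // For every CAS in the group, prints each JoBim of every
        // RepeatedSentence once per configured extractor, using the concise
        // old output format.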
        @Override
        public void reduce(Writable key, Iterator<CASWritable> values, OutputCollector<Text, NullWritable> output,
                Reporter reporter) throws IOException {

            while (values.hasNext()) {
                try {
                    JCas aJCas = values.next().getCAS().getJCas();

                    Collection<RepeatedSentence> covering_annotations = JCasUtil.select(aJCas,
                            RepeatedSentence.class);
                    for (RepeatedSentence covering_annotation : covering_annotations) {
                        int repetitions = covering_annotation.getRepetitionCount();
                        for (JoBim jb : JoBimPrinter.getJoBims(covering_annotation, false)) {
                            for (JobimAnnotationExtractor extractor : _extractors) {
                                _line.set(JoBimPrinter.get_concise_string_old_format(jb, covering_annotation,
                                        extractor, repetitions));
                                output.collect(_line, NullWritable.get());
                            }
                        }
                    }
                } catch (CASException e) {
                    logger.error("Could not obtain a JCas from the CAS.", e);
                }
            }
        }

    }

}
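
Example

For reference, here is a minimal sketch of how this driver might be launched programmatically, mirroring the usage string printed by print_usage(). The extractor configuration file and the input/output paths are placeholders; the -D property name is taken from SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS, as in the job itself.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

import de.tudarmstadt.lt.n2n.hadoop.pipetests.GoogleSyntacticsJob5;
import de.tudarmstadt.lt.n2n.utilities.SHARED_CONSTANTS;

public class GoogleSyntacticsJob5Launcher {
    public static void main(String[] args) throws Exception {
        // Equivalent to the documented command line:
        //   ... GoogleSyntacticsJob5 -D<extractor-configs>=<file.xml> input output
        String[] jobArgs = {
                "-D" + SHARED_CONSTANTS.PARAM_EXTRACTORCONFIGS + "=extractor-config.xml", // placeholder
                "/path/to/input", // placeholder input path
                "/path/to/output" // placeholder output path
        };
        System.exit(ToolRunner.run(new Configuration(), new GoogleSyntacticsJob5(), jobArgs));
    }
}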