Example usage for org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat.setOutputFormatClass

Introduction

This page collects example usages of org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat.setOutputFormatClass.

Prototype

@SuppressWarnings("unchecked")
public static void setOutputFormatClass(Job job, Class<? extends OutputFormat> theClass)

Source Link

Document

Set the underlying output format for LazyOutputFormat.

Usage

From source file:io.dataapps.chlorine.hadoop.HDFSScanMR.java

License:Apache License

/**
 * Builds the map-only "Chlorine_HDFS_Scan" job.
 *
 * <p>Scan parameters are written into {@code conf} before the job is created. The job reads
 * {@code in} recursively as text, filtered through {@code NewFilesFilter}, maps with
 * {@code DeepScanMapper}, runs no reducers and writes to {@code out} through
 * {@code LazyOutputFormat} wrapping {@code TextOutputFormat}.
 *
 * @param conf                   configuration to populate; it is modified in place
 * @param in                     input path to scan
 * @param out                    job output path
 * @param matchPath              value stored under the {@code matchPath} property
 * @param scanSince              value stored under the {@code scanSince} property
 * @param chlorineConfigFilePath optional finder file; when non-null it is added to the job's
 *                               cache files and its file name is stored under {@code finder_file}
 * @param queue                  optional queue name; when non-null it is stored under
 *                               {@code mapred.job.queue.name}
 * @param maskPath               value stored under the {@code maskPath} property
 * @return the configured, not yet submitted job
 * @throws IOException if the job cannot be created or configured
 */
public static Job makeJob(Configuration conf, Path in, Path out, String matchPath, long scanSince,
        String chlorineConfigFilePath, String queue, String maskPath) throws IOException {
    conf.setBoolean("mapred.output.compress", false);
    conf.setLong("scanSince", scanSince);
    conf.set("matchPath", matchPath);
    conf.set("maskPath", maskPath);
    conf.set("inputPath", in.toString());
    if (queue != null) {
        conf.set("mapred.job.queue.name", queue);
    }
    conf.set("fs.permissions.umask-mode", "007");
    conf.setInt("input_path_depth", in.depth());

    // Job.getInstance takes a copy of conf, so "finder_file" has to be set BEFORE the job is
    // created; setting it afterwards on conf would never reach the submitted job.
    URI finderFileUri = null;
    if (chlorineConfigFilePath != null) {
        try {
            finderFileUri = new URI(chlorineConfigFilePath);
            conf.set("finder_file", (new File(chlorineConfigFilePath)).getName());
        } catch (URISyntaxException e) {
            // Best effort: an unparsable finder file path is logged and the job is built without it.
            LOG.error(e);
        }
    }

    Job job = Job.getInstance(conf, "Chlorine_HDFS_Scan");
    job.setJarByClass(HDFSScanMR.class);
    if (finderFileUri != null) {
        job.addCacheFile(finderFileUri);
    }

    job.setMapperClass(DeepScanMapper.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(job, in);
    TextInputFormat.setInputDirRecursive(job, true);
    TextInputFormat.setInputPathFilter(job, NewFilesFilter.class);
    FileOutputFormat.setOutputPath(job, out);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
    return job;
}

From source file:it.crs4.pydoop.mapreduce.pipes.CommandLineParser.java

License:Apache License

/**
 * Parses the pipes command line, configures a job from the recognised options and runs it.
 *
 * <p>Options handled here: {@code input}, {@code output}, {@code jar}, {@code inputformat},
 * {@code javareader}, {@code map}, {@code partitioner}, {@code reduce}, {@code reduces},
 * {@code writer}, {@code lazyOutput}, {@code avroInput}, {@code avroOutput} and
 * {@code program}.
 *
 * @param args command-line arguments
 * @return 1 when no arguments are given, when parsing fails or when the job fails;
 *         0 when the job completes successfully
 * @throws Exception if configuring or running the job fails for any other reason
 */
public int run(String[] args) throws Exception {
    CommandLineParser cli = new CommandLineParser();
    if (args.length == 0) {
        cli.printUsage();
        return 1;
    }
    try {
        Job job = new Job(new Configuration());
        job.setJobName(getClass().getName());
        Configuration conf = job.getConfiguration();
        CommandLine results = cli.parse(conf, args);
        if (results.hasOption("input")) {
            Path path = new Path(results.getOptionValue("input"));
            FileInputFormat.setInputPaths(job, path);
        }
        if (results.hasOption("output")) {
            Path path = new Path(results.getOptionValue("output"));
            FileOutputFormat.setOutputPath(job, path);
        }
        if (results.hasOption("jar")) {
            job.setJar(results.getOptionValue("jar"));
        }
        if (results.hasOption("inputformat")) {
            explicitInputFormat = true;
            setIsJavaRecordReader(conf, true);
            job.setInputFormatClass(getClass(results, "inputformat", conf, InputFormat.class));
        }
        if (results.hasOption("javareader")) {
            setIsJavaRecordReader(conf, true);
        }
        if (results.hasOption("map")) {
            setIsJavaMapper(conf, true);
            job.setMapperClass(getClass(results, "map", conf, Mapper.class));
        }
        if (results.hasOption("partitioner")) {
            job.setPartitionerClass(getClass(results, "partitioner", conf, Partitioner.class));
        }
        if (results.hasOption("reduce")) {
            setIsJavaReducer(conf, true);
            job.setReducerClass(getClass(results, "reduce", conf, Reducer.class));
        }
        if (results.hasOption("reduces")) {
            job.setNumReduceTasks(Integer.parseInt(results.getOptionValue("reduces")));
        }
        if (results.hasOption("writer")) {
            explicitOutputFormat = true;
            setIsJavaRecordWriter(conf, true);
            job.setOutputFormatClass(getClass(results, "writer", conf, OutputFormat.class));
        }
        // Wrap whatever output format is configured at this point so that empty
        // part files are not created.
        if (results.hasOption("lazyOutput") && Boolean.parseBoolean(results.getOptionValue("lazyOutput"))) {
            LazyOutputFormat.setOutputFormatClass(job, job.getOutputFormatClass());
        }
        // Upper-case with a fixed locale: the default-locale variant can produce characters
        // (e.g. a dotted capital I under a Turkish locale) that never match an enum constant.
        if (results.hasOption("avroInput")) {
            avroInput = AvroIO.valueOf(results.getOptionValue("avroInput").toUpperCase(java.util.Locale.ROOT));
        }
        if (results.hasOption("avroOutput")) {
            avroOutput = AvroIO.valueOf(results.getOptionValue("avroOutput").toUpperCase(java.util.Locale.ROOT));
        }

        if (results.hasOption("program")) {
            setExecutable(conf, results.getOptionValue("program"));
        }
        // if they gave us a jar file, include it into the class path
        String jarFile = job.getJar();
        if (jarFile != null) {
            final URL[] urls = new URL[] { FileSystem.getLocal(conf).pathToFile(new Path(jarFile)).toURL() };
            // FindBugs complains that creating a URLClassLoader should be
            // in a doPrivileged() block.
            ClassLoader loader = AccessController.doPrivileged(new PrivilegedAction<ClassLoader>() {
                public ClassLoader run() {
                    return new URLClassLoader(urls);
                }
            });
            conf.setClassLoader(loader);
        }
        setupPipesJob(job);
        return job.waitForCompletion(true) ? 0 : 1;
    } catch (ParseException pe) {
        LOG.info("Error : " + pe);
        cli.printUsage();
        return 1;
    }
}

From source file:nl.cwi.kba2013.apps.chunk_stream_DocExtractor.java

License:Apache License

/**
 * Runs the "chunk -stream" job: text input from {@code -i}, {@code MyMapper}, one reducer,
 * lazily created text output at {@code -o}. The file given with {@code -tt} is published to
 * the tasks through the distributed cache.
 *
 * <p>Recognised arguments: {@code -i <input>}, {@code -o <output>}, {@code -tt <file>},
 * {@code -h}/{@code --help}. All three value arguments are required; anything else is
 * rejected with the usage message.
 *
 * @param args command-line arguments
 * @return the result of {@code printUsage()} for invalid arguments, otherwise 0 when the job
 *         succeeds and 1 when it fails
 * @throws Exception if configuring or running the job fails
 */
@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;
    String traintestFile = null;
    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-tt".equals(args[i])) {
                traintestFile = args[++i];
                LOG.info("TrainTest");
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            // args[++i] ran off the end, so i - 1 is the flag that is missing its value.
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    // traintestFile is turned into a Path below, so it is as mandatory as in/out.
    if (!other_args.isEmpty() || in == null || out == null || traintestFile == null) {
        return printUsage();
    }

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);

    Configuration conf = getConf();

    conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString());

    // set time
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    Job job = new Job(conf, "chunk -stream");
    job.setJarByClass(chunk_stream_DocExtractor.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the train/test file available to each task under the TrainTest_HDFS link name.
    DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS),
            job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setNumReduceTasks(1);

    // Any previous output is removed so the job can write to the same location again.
    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    // NOTE(review): run_info is filled from the job counters but is not read again in this
    // method - confirm whether it is still needed.
    Counters c = job.getCounters();

    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time", ((double) cputime));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS)
            .getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS)
            .getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    return status;
}

From source file:nl.cwi.kba2013.apps.FeatureExtractor_DocExtractor.java

License:Apache License

/**
 * Runs the "Feature Extractor" job and prints the run description plus an ARFF-style header
 * to standard output.
 *
 * <p>Recognised arguments: {@code -i <input>}, {@code -o <output>}, {@code -q <query file>},
 * {@code -l <labels file>}, {@code -a <annotation file>}, {@code -p <ppr file>},
 * {@code -g <gcld file>}, {@code -s <myver file>}, {@code -c <context file>},
 * {@code -w <wiki file>}, {@code -tt <train/test file>} (all required, because each one is
 * turned into a path below), and the optional {@code -r <runtag>}, {@code -t <teamname>},
 * {@code -d <description>}, {@code -ds <short description>}, {@code -h}/{@code --help}.
 *
 * @param args command-line arguments
 * @return the result of {@code printUsage()} for invalid arguments, otherwise 0 when the job
 *         succeeds and 1 when it fails
 * @throws Exception if configuring or running the job fails
 */
@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;
    String queryfile = null;
    String contextFile = null;
    String systemdescription = null;
    String systemdescription_short = null;
    String runtag = null;
    String teamname = null;
    String annoFile = null;
    String gcldFile = null;
    String labelsFile = null;
    String pprFile = null;
    String myverFile = null;
    String wikiFile = null;
    String traintestFile = null;
    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryfile = args[++i];
            } else if ("-r".equals(args[i])) {
                runtag = args[++i];
            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-a".equals(args[i])) {
                annoFile = args[++i];
            } else if ("-t".equals(args[i])) {
                teamname = args[++i];
            } else if ("-d".equals(args[i])) {
                systemdescription = args[++i];
            } else if ("-ds".equals(args[i])) {
                systemdescription_short = args[++i];
            } else if ("-p".equals(args[i])) {
                pprFile = args[++i];
            } else if ("-g".equals(args[i])) {
                gcldFile = args[++i];
            } else if ("-s".equals(args[i])) {
                myverFile = args[++i];
            } else if ("-c".equals(args[i])) {
                contextFile = args[++i];
            } else if ("-w".equals(args[i])) {
                wikiFile = args[++i];
            } else if ("-tt".equals(args[i])) {
                traintestFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            // args[++i] ran off the end, so i - 1 is the flag that is missing its value.
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    // Every file argument is converted to a Path further down; a missing one must end in the
    // usage message rather than in an exception from the Path constructor.
    boolean missingRequired = in == null || out == null || queryfile == null || labelsFile == null
            || annoFile == null || pprFile == null || myverFile == null || gcldFile == null
            || contextFile == null || wikiFile == null || traintestFile == null;
    if (!other_args.isEmpty() || missingRequired) {
        return printUsage();
    }

    if (runtag == null) {
        runtag = "toy_1";
    }
    if (teamname == null) {
        teamname = "CompInsights";
    }
    // There is no command-line switch for the corpus id, so it is always this value.
    String corpus_id = "kba-stream-corpus-2012-cleansed-only";
    if (systemdescription == null) {
        systemdescription = "Description intentionally left blank.";
    }
    if (systemdescription_short == null) {
        systemdescription_short = "a two -step classification approach";
    }

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);
    LOG.info(" - runtag: " + runtag);
    LOG.info(" - teamname: " + teamname);
    LOG.info(" - corpus_id: " + corpus_id);
    LOG.info(" - run description: " + systemdescription);

    Run_info fr = new Run_info.Factory().create(teamname, runtag, systemdescription, systemdescription_short,
            corpus_id);

    // Attribute names for the ARFF header; insertion order is the order they are printed in.
    Map<String, String> Attr = new LinkedHashMap<String, String>();
    Attr.put("gcld", "0");
    Attr.put("jac", "0");
    Attr.put("cos", "0");
    Attr.put("kl", "0");
    Attr.put("ppr", "0");
    Attr.put("s_form", "0");
    Attr.put("contxR", "0");
    Attr.put("contxL", "0");
    Attr.put("FirstPos", "0");
    Attr.put("LastPos", "0");
    Attr.put("LengthBody", "0");
    Attr.put("FirstPosNorm", "0");
    Attr.put("MentionsBody", "0");
    Attr.put("RelatedBody", "0");
    Attr.put("Spread", "0");
    Attr.put("LastPosNorm", "0");
    Attr.put("SpreadNorm", "0");
    Attr.put("LengthAnchor", "0");
    Attr.put("Source", "0");
    Attr.put("LengthTitle", "0");
    Attr.put("partial", "0");
    Attr.put("MentionsAnchor", "0");
    Attr.put("Relatedtitle", "0");
    Attr.put("English", "0");
    Attr.put("RelatedAnchor", "0");
    Attr.put("MentionsTitle", "0");
    Attr.put("Class", "0");

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());
    conf.set(ANNOFILEPATH_HDFS, new Path(annoFile).toUri().toString());
    conf.set(PPR_HDFS, new Path(pprFile).toUri().toString());
    conf.set(MYVER, new Path(myverFile).toUri().toString());
    conf.set(GCLD_HDFS, new Path(gcldFile).toUri().toString());
    conf.set(CONTEXT_HDFS, new Path(contextFile).toUri().toString());
    conf.set(WIKI_HDFS, new Path(wikiFile).toUri().toString());
    conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString());
    conf.set(RUNTAG, runtag);
    conf.set(TEAMNAME, teamname);

    // set time
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    FileSystem fs = FileSystem.get(conf);
    // Lookup required data from the topic file
    loadTopicData(queryfile, fr, fs, run_info);
    Job job = new Job(conf, "Feature Extractor");
    job.setJarByClass(FeatureExtractor_DocExtractor.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the side files available to each task, each under its own link name.
    DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(annoFile) + "#" + ANNOFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(pprFile) + "#" + PPR_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(gcldFile) + "#" + GCLD_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(contextFile) + "#" + CONTEXT_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(wikiFile) + "#" + WIKI_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(myverFile) + "#" + MYVER), job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));
    job.setNumReduceTasks(1);

    // Any previous output is removed so the job can write to the same location again.
    FileSystem.get(conf).delete(new Path(out), true);
    SequenceFileOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    Counters c = job.getCounters();

    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time", ((double) cputime));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS)
            .getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS)
            .getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    fr.setAdditionalProperties("run_info", run_info);

    // Run description as a '#'-prefixed line, followed by the ARFF header.
    System.out.println("#" + new Run_info.Factory().toJSON(fr));
    System.out.println("@RELATION" + " trec-kba" + " ");
    for (String key : Attr.keySet()) {
        if (key.equalsIgnoreCase("English")) {
            System.out.println("@ATTRIBUTE " + key + " " + "{0,1,2}");
        } else if (key.equalsIgnoreCase("Class")) {
            System.out.println("@ATTRIBUTE " + key + " " + "{0,1}");
        } else {
            System.out.println("@ATTRIBUTE " + key + " " + "NUMERIC");
        }
    }
    // Only the @DATA marker is printed; this method emits no data rows after it.
    System.out.println("\n@DATA");

    // Pretty-printed run description with every line prefixed by '#'.
    System.out.println("#" + new Run_info.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#"));

    return status;
}

From source file:nl.cwi.kba2013.apps.KBaDocExtractorFromCleansed.java

License:Apache License

/**
 * Runs the map-only "Annotation Extraction" job: Thrift input from {@code -i},
 * {@code MyMapper}, lazily created sequence-file output at {@code -o}. The file given with
 * {@code -t} is published to the tasks through the distributed cache.
 *
 * <p>Recognised arguments: {@code -i <input>}, {@code -o <output>}, {@code -t <file>},
 * {@code -h}/{@code --help}. All three value arguments are required.
 *
 * @param args command-line arguments
 * @return the result of {@code printUsage()} for invalid arguments, otherwise 0 when the job
 *         succeeds and 1 when it fails
 * @throws Exception if configuring or running the job fails
 */
@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;
    String traintestFile = null;

    // Unrecognised arguments are collected here; this method does not reject them.
    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-t".equals(args[i])) {
                traintestFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            // args[++i] ran off the end, so i - 1 is the flag that is missing its value.
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    // All three values are turned into Path objects below; without this check a missing
    // argument surfaces as an exception from the Path constructor instead of the usage text.
    if (in == null || out == null || traintestFile == null) {
        return printUsage();
    }

    Configuration conf = getConf();
    conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString());

    // set time
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    Job job = new Job(conf, "Annotation Extraction");
    job.setJarByClass(KBaDocExtractorFromCleansed.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS),
            job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));
    job.setNumReduceTasks(0);

    // Any previous output is removed so the job can write to the same location again.
    FileSystem.get(conf).delete(new Path(out), true);
    SequenceFileOutputFormat.setOutputPath(job, new Path(out));

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StreamItemWritable.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    int status = job.waitForCompletion(true) ? 0 : 1;

    // NOTE(review): the CPU counter is fetched but its value is not used in this method.
    Counters c = job.getCounters();
    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();

    return status;
}

From source file:nl.cwi.kba2013.apps.KbaExtractMissingFromRaw.java

License:Apache License

/**
 * Runs the map-only "Missing Extractor" job: sequence-file input from {@code -i},
 * {@code MyMapper}, lazily created text output at {@code -o}. The query, labels and
 * train/test files are published to the tasks through the distributed cache.
 *
 * <p>Recognised arguments: {@code -i <input>}, {@code -o <output>}, {@code -q <query file>},
 * {@code -l <labels file>}, {@code -t <train/test file>}, {@code -h}/{@code --help}. All five
 * value arguments are required; anything else is rejected with the usage message.
 *
 * @param args command-line arguments
 * @return the result of {@code printUsage()} for invalid arguments, otherwise 0 when the job
 *         succeeds and 1 when it fails
 * @throws Exception if configuring or running the job fails
 */
@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;
    String queryfile = null;
    String labelsFile = null;
    String traintestFile = null;

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryfile = args[++i];
            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-t".equals(args[i])) {
                traintestFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            // args[++i] ran off the end, so i - 1 is the flag that is missing its value.
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    // labelsFile and traintestFile are turned into Path objects below, so they are as
    // mandatory as the other three values.
    if (!other_args.isEmpty() || in == null || out == null || queryfile == null || labelsFile == null
            || traintestFile == null) {
        return printUsage();
    }

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());
    conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString());

    // set time
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    Job job = new Job(conf, "Missing Extractor");
    job.setJarByClass(KbaExtractMissingFromRaw.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the side files available to each mapper, each under its own link name.
    DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS),
            job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));
    job.setNumReduceTasks(0);

    // Any previous output is removed so the job can write to the same location again.
    FileSystem.get(conf).delete(new Path(out), true);
    SequenceFileOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:nl.cwi.kba2013.apps.KbaNameVariantMatch.java

License:Apache License

/**
 * Runs the "Feature Extractor" name-variant matching job: sequence-file input from
 * {@code -i}, {@code MyMapper}, one reducer, lazily created text output at {@code -o}. The
 * query, labels and train/test files are published to the tasks through the distributed
 * cache.
 *
 * <p>Recognised arguments: {@code -i <input>}, {@code -o <output>}, {@code -q <query file>},
 * {@code -l <labels file>}, {@code -t <train/test file>}, {@code -h}/{@code --help}. All five
 * value arguments are required; anything else is rejected with the usage message.
 *
 * @param args command-line arguments
 * @return the result of {@code printUsage()} for invalid arguments, otherwise 0 when the job
 *         succeeds and 1 when it fails
 * @throws Exception if configuring or running the job fails
 */
@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;
    String queryfile = null;
    String labelsFile = null;
    String traintestFile = null;

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryfile = args[++i];
            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-t".equals(args[i])) {
                traintestFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            // args[++i] ran off the end, so i - 1 is the flag that is missing its value.
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    // labelsFile and traintestFile are turned into Path objects below, so they are as
    // mandatory as the other three values.
    if (!other_args.isEmpty() || in == null || out == null || queryfile == null || labelsFile == null
            || traintestFile == null) {
        return printUsage();
    }

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);

    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());
    conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString());

    // set time
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    Job job = new Job(conf, "Feature Extractor");
    job.setJarByClass(KbaNameVariantMatch.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the side files available to each mapper, each under its own link name.
    DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS),
            job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));
    job.setNumReduceTasks(1);

    // Any previous output is removed so the job can write to the same location again.
    FileSystem.get(conf).delete(new Path(out), true);
    SequenceFileOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:nl.cwi.kba2013.apps.KbaNameVariantMatchFeatureExtractor.java

License:Apache License

/**
 * Parses the command line, configures and runs the "Feature Extractor"
 * MapReduce job, then prints the run description and an ARFF-style header
 * ({@code @RELATION}, {@code @ATTRIBUTE}, {@code @DATA}) to standard output.
 *
 * <p>Options (each followed by a value unless noted):
 * <ul>
 * <li>{@code -i} input path, {@code -o} output path (deleted recursively
 * before the job starts), {@code -q} query file</li>
 * <li>{@code -l} labels file, {@code -a} annotation file, {@code -p} ppr
 * file, {@code -g} gcld file, {@code -s} myver file, {@code -c} context
 * file, {@code -w} wiki file, {@code -tt} train/test file</li>
 * <li>{@code -r} run tag, {@code -t} team name, {@code -d} system
 * description, {@code -ds} short system description (all optional, with
 * built-in defaults)</li>
 * <li>{@code -h} / {@code --help} prints the usage text (takes no value)</li>
 * </ul>
 *
 * <p>Every path and file option is passed to {@code new Path(...)}, which
 * rejects a null string, so all of them are treated as required: a missing
 * one yields the usage text instead of an exception during job setup.
 *
 * @param args command-line arguments as described above
 * @return the value of {@code printUsage()} when the arguments are invalid
 *         or help was requested; otherwise 0 if the job succeeded, 1 if not
 * @throws Exception propagated from topic loading, job configuration,
 *         job execution or counter retrieval
 */
@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;
    String queryfile = null;
    String contextFile = null;
    String systemdescription = null;
    String systemdescription_short = null;
    // No command-line option overrides the corpus id, so it is a fixed value.
    String corpus_id = "kba-stream-corpus-2012-cleansed-only";
    String runtag = null;
    String teamname = null;
    String annoFile = null;
    String gcldFile = null;
    String labelsFile = null;
    String pprFile = null;
    String myverFile = null;
    String wikiFile = null;
    String traintestFile = null;
    HashMap<String, Object> run_info = new HashMap<String, Object>();

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
                queryfile = args[++i];
            } else if ("-r".equals(args[i])) {
                runtag = args[++i];
            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-a".equals(args[i])) {
                annoFile = args[++i];
            } else if ("-t".equals(args[i])) {
                teamname = args[++i];
            } else if ("-d".equals(args[i])) {
                systemdescription = args[++i];
            } else if ("-ds".equals(args[i])) {
                systemdescription_short = args[++i];
            } else if ("-p".equals(args[i])) {
                pprFile = args[++i];
            } else if ("-g".equals(args[i])) {
                gcldFile = args[++i];
            } else if ("-s".equals(args[i])) {
                myverFile = args[++i];
            } else if ("-c".equals(args[i])) {
                contextFile = args[++i];
            } else if ("-w".equals(args[i])) {
                wikiFile = args[++i];
            } else if ("-tt".equals(args[i])) {
                traintestFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            // The flag was the last argument: ++i already ran, so args[i - 1]
            // is the flag whose value is missing.
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    // Each of these is dereferenced through new Path(...) below; reject a
    // missing one up front rather than failing on a null path mid-setup.
    boolean missingRequired = in == null || out == null || queryfile == null
            || labelsFile == null || annoFile == null || pprFile == null
            || myverFile == null || gcldFile == null || contextFile == null
            || wikiFile == null || traintestFile == null;
    if (!other_args.isEmpty() || missingRequired) {
        return printUsage();
    }

    // Defaults for the optional run-description fields.
    if (runtag == null) {
        runtag = "toy_1";
    }
    if (teamname == null) {
        teamname = "CompInsights";
    }
    if (systemdescription == null) {
        systemdescription = "Description intentionally left blank.";
    }
    if (systemdescription_short == null) {
        systemdescription_short = "a two -step classification approach";
    }

    Run_info fr = new Run_info.Factory().create(teamname, runtag, systemdescription, systemdescription_short,
            corpus_id);

    // Attribute names in the order they are written to the header below.
    String[] attributeNames = { "gcld", "jac", "cos", "kl", "ppr", "s_form", "contxR", "contxL", "FirstPos",
            "LastPos", "LengthBody", "FirstPosNorm", "MentionsBody", "RelatedBody", "Spread", "LastPosNorm",
            "SpreadNorm", "LengthAnchor", "Source", "LengthTitle", "partial", "MentionsAnchor", "Relatedtitle",
            "English", "RelatedAnchor", "MentionsTitle", "Class" };

    // All settings must be on conf before the Job is constructed from it.
    Configuration conf = getConf();
    conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString());
    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());
    conf.set(ANNOFILEPATH_HDFS, new Path(annoFile).toUri().toString());
    conf.set(PPR_HDFS, new Path(pprFile).toUri().toString());
    conf.set(MYVER, new Path(myverFile).toUri().toString());
    conf.set(GCLD_HDFS, new Path(gcldFile).toUri().toString());
    conf.set(CONTEXT_HDFS, new Path(contextFile).toUri().toString());
    conf.set(WIKI_HDFS, new Path(wikiFile).toUri().toString());
    conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString());
    conf.set(RUNTAG, runtag);
    conf.set(TEAMNAME, teamname);

    // Task timeout and child JVM options.
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    FileSystem fs = FileSystem.get(conf);
    // Lookup required data from the topic file
    loadTopicData(queryfile, fr, fs, run_info);
    Job job = new Job(conf, "Feature Extractor");
    job.setJarByClass(KbaNameVariantMatchFeatureExtractor.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // Ship every side file to the tasks; the URI fragment after '#' is the
    // link name registered for the cached file.
    DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(annoFile) + "#" + ANNOFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(pprFile) + "#" + PPR_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(gcldFile) + "#" + GCLD_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(contextFile) + "#" + CONTEXT_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(wikiFile) + "#" + WIKI_HDFS), job.getConfiguration());
    DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS),
            job.getConfiguration());

    DistributedCache.addCacheFile(new URI(new Path(myverFile) + "#" + MYVER), job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    // No reducer class is set here; a single reduce task is requested.
    job.setNumReduceTasks(1);

    // Remove any previous output so the job can write to the same location.
    fs.delete(new Path(out), true);
    SequenceFileOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    // Record job counters in the run description.
    Counters c = job.getCounters();

    long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue();
    run_info.put("elapsed_time", ((double) cputime));

    long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS)
            .getValue();
    run_info.put("num_filter_results", num_filter_results);

    long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS)
            .getValue();
    run_info.put("num_entity_doc_compares", num_entity_doc_compares);

    long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue();
    run_info.put("num_stream_hours", hours);

    fr.setAdditionalProperties("run_info", run_info);

    // Header: run description as a '#' comment line, then one @ATTRIBUTE
    // line per feature. "English" and "Class" are nominal, the rest numeric.
    System.out.println("#" + new Run_info.Factory().toJSON(fr));
    System.out.println("@RELATION trec-kba ");
    for (String key : attributeNames) {
        String type;
        if (key.equalsIgnoreCase("English")) {
            type = "{0,1,2}";
        } else if (key.equalsIgnoreCase("Class")) {
            type = "{0,1}";
        } else {
            type = "NUMERIC";
        }
        System.out.println("@ATTRIBUTE " + key + " " + type);
    }
    // NOTE(review): only the @DATA marker is printed here; the data rows are
    // presumably taken from the job output under `out` - confirm.
    System.out.println("\n@DATA");

    // Pretty-printed run description, every line prefixed with '#'.
    System.out.println("#" + new Run_info.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#"));

    return status;

}

From source file:nl.cwi.kba2013.apps.KBANameVariantMatchTHERank.java

License:Apache License

/**
 * Parses {@code -i} (input path), {@code -o} (output path) and {@code -l}
 * (labels file), then configures and runs the "Feature Extractor" job with
 * {@code MyMapper} as mapper and {@code MyReducer} as both combiner and
 * reducer, using 50 reduce tasks.
 *
 * <p>All three options are required. Unknown arguments, a flag without a
 * value, or {@code -h} / {@code --help} produce the usage text. The output
 * path is deleted recursively before the job is started.
 *
 * @param args command-line arguments
 * @return the value of {@code printUsage()} when the arguments are invalid
 *         or help was requested; otherwise 0 if the job succeeded, 1 if not
 * @throws Exception propagated from job configuration or execution
 */
@Override
public int run(String[] args) throws Exception {
    String inputDir = null;
    String outputDir = null;
    String labelsPath = null;
    final List<String> unrecognised = new ArrayList<String>();

    int pos = 0;
    while (pos < args.length) {
        final String option = args[pos];
        if ("-h".equals(option) || "--help".equals(option)) {
            return printUsage();
        }
        try {
            if ("-i".equals(option)) {
                inputDir = args[++pos];
            } else if ("-o".equals(option)) {
                outputDir = args[++pos];
            } else if ("-l".equals(option)) {
                labelsPath = args[++pos];
            } else {
                unrecognised.add(option);
            }
        } catch (ArrayIndexOutOfBoundsException missingValue) {
            // The flag was the final argument, so there is no value to read.
            System.out.println("ERROR: Required parameter missing from " + option);
            return printUsage();
        }
        pos++;
    }

    final boolean complete = inputDir != null && outputDir != null && labelsPath != null;
    if (!unrecognised.isEmpty() || !complete) {
        return printUsage();
    }

    LOG.info("Tool: " + getClass().getName());
    LOG.info(" - input path: " + inputDir);
    LOG.info(" - output path: " + outputDir);

    // Everything on conf has to be set before the Job is built from it.
    final Configuration conf = getConf();
    conf.set(LABELSFILEPATH_HDFS, new Path(labelsPath).toUri().toString());
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    final Job job = new Job(conf, "Feature Extractor");
    final Configuration jobConf = job.getConfiguration();
    job.setJarByClass(KBANameVariantMatchTHERank.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // Ship the labels file to the tasks; the fragment after '#' is the link
    // name registered for the cached file.
    final URI labelsCacheUri = new URI(new Path(labelsPath) + "#" + LABELSFILEPATH_HDFS);
    DistributedCache.addCacheFile(labelsCacheUri, jobConf);
    DistributedCache.createSymlink(jobConf);

    // Input side.
    job.setInputFormatClass(ThriftFileInputFormat.class);
    FileInputFormat.addInputPath(job, new Path(inputDir));

    // Processing: the reducer doubles as the combiner.
    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(50);

    // Output side: clear any previous run, then write Text/Text pairs
    // through a lazily-created text output.
    final Path target = new Path(outputDir);
    FileSystem.get(conf).delete(target, true);
    TextOutputFormat.setOutputPath(job, target);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:nl.cwi.kba2013.apps.KBANER.java

License:Apache License

/**
 * Parses {@code -i} (input path), {@code -o} (output path) and {@code -l}
 * (labels file), then configures and runs the "Feature Extractor" job over
 * Thrift input with {@code MyMapper} and zero reduce tasks.
 *
 * <p>All three options are required. Unknown arguments, a flag without a
 * value, or {@code -h} / {@code --help} produce the usage text. The output
 * path is deleted recursively before the job is started.
 *
 * @param args command-line arguments
 * @return the value of {@code printUsage()} when the arguments are invalid
 *         or help was requested; otherwise 0 if the job succeeded, 1 if not
 * @throws Exception propagated from job configuration or execution
 */
@Override
public int run(String[] args) throws Exception {
    String source = null;
    String destination = null;
    String labels = null;
    final List<String> leftovers = new ArrayList<String>();

    int idx = 0;
    while (idx < args.length) {
        final String flag = args[idx];
        if ("-h".equals(flag) || "--help".equals(flag)) {
            return printUsage();
        }
        try {
            if ("-i".equals(flag)) {
                source = args[++idx];
            } else if ("-o".equals(flag)) {
                destination = args[++idx];
            } else if ("-l".equals(flag)) {
                labels = args[++idx];
            } else {
                leftovers.add(flag);
            }
        } catch (ArrayIndexOutOfBoundsException noValue) {
            // The flag was the final argument, so there is no value to read.
            System.out.println("ERROR: Required parameter missing from " + flag);
            return printUsage();
        }
        idx++;
    }

    final boolean allPresent = source != null && destination != null && labels != null;
    if (!leftovers.isEmpty() || !allPresent) {
        return printUsage();
    }

    LOG.info("Tool: " + getClass().getName());
    LOG.info(" - input path: " + source);
    LOG.info(" - output path: " + destination);

    // Everything on conf has to be set before the Job is built from it.
    final Configuration conf = getConf();
    conf.set(LABELSFILEPATH_HDFS, new Path(labels).toUri().toString());
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    final Job job = new Job(conf, "Feature Extractor");
    final Configuration jobConf = job.getConfiguration();
    job.setJarByClass(KBANER.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // Ship the labels file to the tasks; the fragment after '#' is the link
    // name registered for the cached file.
    final URI labelsCacheUri = new URI(new Path(labels) + "#" + LABELSFILEPATH_HDFS);
    DistributedCache.addCacheFile(labelsCacheUri, jobConf);
    DistributedCache.createSymlink(jobConf);

    // Input side.
    job.setInputFormatClass(ThriftFileInputFormat.class);
    FileInputFormat.addInputPath(job, new Path(source));

    // NOTE(review): reduce tasks are set to 0 below, so the combiner and
    // reducer registered here presumably never run - confirm this is intended.
    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(0);

    // Output side: clear any previous run, then write Text/Text pairs
    // through a lazily-created text output.
    final Path target = new Path(destination);
    FileSystem.get(conf).delete(target, true);
    TextOutputFormat.setOutputPath(job, target);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    return job.waitForCompletion(true) ? 0 : 1;
}