Example usage for org.apache.hadoop.mapred FileOutputFormat setCompressOutput

List of usage examples for org.apache.hadoop.mapred FileOutputFormat setCompressOutput

Introduction

This page collects example usage of org.apache.hadoop.mapred.FileOutputFormat.setCompressOutput.

Prototype

public static void setCompressOutput(JobConf conf, boolean compress) 

Document

Set whether the output of the job is compressed.
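
As a quick orientation, here is a minimal, self-contained sketch of how this method is typically called. The class name, output path, and the choice of GzipCodec are illustrative assumptions, not taken from the examples below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;

public class CompressOutputSketch {
    public static JobConf configure() {
        JobConf conf = new JobConf(CompressOutputSketch.class);
        // Illustrative output path (an assumption, not from the examples below).
        FileOutputFormat.setOutputPath(conf, new Path("/tmp/example-out"));
        // Enable compression of the job output...
        FileOutputFormat.setCompressOutput(conf, true);
        // ...and optionally choose a codec (GzipCodec here is just one option).
        FileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
        return conf;
    }
}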

Usage

From source file: org.apache.avro.mapred.TestWordCount.java

License: Apache License

@SuppressWarnings("deprecation")
public void testJob() throws Exception {
    JobConf job = new JobConf();
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out");

    outputPath.getFileSystem(job).delete(outputPath);
    WordCountUtil.writeLinesFile();

    job.setJobName("wordcount");

    AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema());

    AvroJob.setMapperClass(job, MapImpl.class);
    AvroJob.setCombinerClass(job, ReduceImpl.class);
    AvroJob.setReducerClass(job, ReduceImpl.class);

    FileInputFormat.setInputPaths(job, new Path(dir + "/in"));
    FileOutputFormat.setOutputPath(job, outputPath);
    FileOutputFormat.setCompressOutput(job, true);

    WordCountUtil.setMeta(job);

    JobClient.runJob(job);

    WordCountUtil.validateCountsFile();
}

From source file: org.apache.avro.mapred.TestWordCountGeneric.java

License: Apache License

@Test
@SuppressWarnings("deprecation")
public void testJob() throws Exception {
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out");
    JobConf job = new JobConf();
    try {
        WordCountUtil.writeLinesFile();

        job.setJobName("wordcount");

        AvroJob.setInputGeneric(job, Schema.create(Schema.Type.STRING));
        AvroJob.setOutputGeneric(job, WordCount.SCHEMA$);

        job.setMapperClass(MapImpl.class);
        job.setCombinerClass(ReduceImpl.class);
        job.setReducerClass(ReduceImpl.class);

        FileInputFormat.setInputPaths(job, new Path(dir + "/in"));
        FileOutputFormat.setOutputPath(job, outputPath);
        FileOutputFormat.setCompressOutput(job, true);

        JobClient.runJob(job);

        WordCountUtil.validateCountsFile();
    } finally {
        outputPath.getFileSystem(job).delete(outputPath);
    }
}

From source file: org.apache.avro.mapred.TestWordCountSpecific.java

License: Apache License

@Test
@SuppressWarnings("deprecation")
public void testJob() throws Exception {
    JobConf job = new JobConf();
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out");

    try {
        WordCountUtil.writeLinesFile();

        job.setJobName("wordcount");

        AvroJob.setInputSpecific(job, Schema.create(Schema.Type.STRING));
        AvroJob.setOutputSpecific(job, WordCount.SCHEMA$);

        job.setMapperClass(MapImpl.class);
        job.setCombinerClass(ReduceImpl.class);
        job.setReducerClass(ReduceImpl.class);

        FileInputFormat.setInputPaths(job, new Path(dir + "/in"));
        FileOutputFormat.setOutputPath(job, outputPath);
        FileOutputFormat.setCompressOutput(job, true);

        JobClient.runJob(job);

        WordCountUtil.validateCountsFile();
    } finally {
        outputPath.getFileSystem(job).delete(outputPath);
    }

}

From source file: org.apache.avro.mapred.tether.TetherOutputFormat.java

License: Apache License

/** Enable output compression using the deflate codec and specify its level.*/
public static void setDeflateLevel(JobConf job, int level) {
    FileOutputFormat.setCompressOutput(job, true);
    job.setInt(AvroOutputFormat.DEFLATE_LEVEL_KEY, level);
}

From source file: org.apache.trevni.avro.TestWordCount.java

License: Apache License

public void testOutputFormat() throws Exception {
    JobConf job = new JobConf();

    WordCountUtil wordCountUtil = new WordCountUtil("trevniMapredTest");

    wordCountUtil.writeLinesFile();

    AvroJob.setInputSchema(job, STRING);
    AvroJob.setOutputSchema(job, Pair.getPairSchema(STRING, LONG));

    AvroJob.setMapperClass(job, MapImpl.class);
    AvroJob.setCombinerClass(job, ReduceImpl.class);
    AvroJob.setReducerClass(job, ReduceImpl.class);

    FileInputFormat.setInputPaths(job, new Path(wordCountUtil.getDir().toString() + "/in"));
    FileOutputFormat.setOutputPath(job, new Path(wordCountUtil.getDir().toString() + "/out"));
    FileOutputFormat.setCompressOutput(job, true);

    job.setOutputFormat(AvroTrevniOutputFormat.class);

    JobClient.runJob(job);

    wordCountUtil.validateCountsFile();
}

From source file: org.archive.wayback.hadoop.CDXSort.java

License: Apache License

/**
 * The main driver for sort program. Invoke this method to submit the
 * map/reduce job.
 * 
 * @throws IOException
 *             When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {

    boolean compressOutput = false;
    boolean dereferenceInputs = false;
    boolean canonicalize = false;
    boolean funkyInput = false;

    JobConf jobConf = new JobConf(getConf(), CDXSort.class);
    jobConf.setJobName("cdxsort");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();

    List<String> otherArgs = new ArrayList<String>();

    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("--compress-output".equals(args[i])) {
                compressOutput = true;
            } else if ("--funky-input".equals(args[i])) {
                funkyInput = true;
            } else if ("--dereference-inputs".equals(args[i])) {
                dereferenceInputs = true;
            } else if ("--canonicalize".equals(args[i])) {
                canonicalize = true;
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Make sure there are exactly 3 parameters left: split input output
    if (otherArgs.size() != 3) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 3.");
        return printUsage();
    }

    String splitPath = otherArgs.get(0);
    String inputPath = otherArgs.get(1);
    String outputPath = otherArgs.get(2);

    // load the split file, find and set the number of reduces
    AlphaPartitioner partitioner = new AlphaPartitioner();
    File localSplitFile = new File(splitPath);
    FileInputStream fis = new FileInputStream(localSplitFile);
    InputStreamReader isr = new InputStreamReader(fis, ByteOp.UTF8);
    BufferedReader bis = new BufferedReader(isr);
    //      try {
    //         partitioner.loadBoundaries(bis);
    //      } catch (IOException except) {
    //         System.err.println("ERROR: Problem loading file " + splitPath);
    //         return printUsage(); // exits
    //      }
    //      jobConf.setNumReduceTasks(partitioner.getNumPartitions());
    //
    //      // copy the split file into the FS, add to the DistributedCache:
    ////      AlphaPartitioner.setPartitionFile(jobConf, localSplitFile);
    //      AlphaPartitioner.setSplitCache(jobConf, localSplitFile);
    //      System.err.println("uploaded split file to FS and DistributedCache");
    //
    //      // Set job configs:
    //      jobConf.setInputFormat(TextInputFormat.class);
    //
    //      jobConf.setOutputFormat(TextOutputFormat.class);
    //      if (canonicalize) {
    //         jobConf.setMapperClass(CDXCanonicalizerMapClass.class);
    //      } else {
    //         jobConf.setMapperClass(CDXMapClass.class);
    //      }
    //      jobConf.setOutputKeyClass(Text.class);
    //      jobConf.setOutputValueClass(Text.class);
    //      jobConf.set("mapred.textoutputformat.separator", " ");
    //      jobConf.setPartitionerClass(AlphaPartitioner.class);

    int inputCount = 0;
    // Set job input:
    if (dereferenceInputs) {

        // SO SLOW... can't add one at a time...
        //         FileReader is2 = new FileReader(new File(inputPath));
        //         BufferedReader bis2 = new BufferedReader(is2);
        //         while (true) {
        //            String line = bis2.readLine();
        //            if (line == null) {
        //               break;
        //            }
        //            FileInputFormat.addInputPath(jobConf, new Path(line));
        //            inputCount++;
        //            System.err.println("Added path(" + inputCount + "): " + line);
        //         }

        // PASS 2:
        //         FileReader is2 = new FileReader(new File(inputPath));
        //         BufferedReader bis2 = new BufferedReader(is2);
        //         ArrayList<String> list = new ArrayList<String>();
        //         
        //         while (true) {
        //            String line = bis2.readLine();
        //            if (line == null) {
        //               break;
        //            }
        //            list.add(line);
        //            inputCount++;
        //         }
        //         Path arr[] = new Path[list.size()];
        //         for(int i=0; i < list.size(); i++) {
        //            arr[i] = new Path(list.get(i));
        //         }
        //         FileInputFormat.setInputPaths(jobConf, arr);

        // PASS 3:
        if (funkyInput) {
            jobConf.setMapperClass(FunkyDeReffingCDXCanonicalizerMapClass.class);
        } else {
            jobConf.setMapperClass(DeReffingCDXCanonicalizerMapClass.class);
        }
        FileInputFormat.setInputPaths(jobConf, new Path(inputPath));
        inputCount = 1;

    } else {
        FileInputFormat.setInputPaths(jobConf, new Path(inputPath));
        inputCount = 1;
    }

    // Set job output:
    FileOutputFormat.setOutputPath(jobConf, new Path(outputPath));

    if (compressOutput) {
        FileOutputFormat.setCompressOutput(jobConf, true);
        FileOutputFormat.setOutputCompressorClass(jobConf, GzipCodec.class);
    }

    //      System.out.println("Running on " + cluster.getTaskTrackers()
    //            + " nodes, processing " + inputCount + " files/directories"
    //            + " into " + outputPath + " with "
    //            + partitioner.getNumPartitions() + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}

From source file: org.terrier.utility.io.HadoopUtility.java

License: Mozilla Public License

/** Utility method to set JobOutputCompression if possible.
 * In general, I find that JobOutputCompression fails for
 * local job trackers, so this code checks the job tracker
 * location first.
 * @param conf JobConf of job.
 * @return true if JobOutputCompression was set.
 */
public static boolean setJobOutputCompression(JobConf conf) {
    if (!conf.get("mapred.job.tracker").equals("local")) {
        FileOutputFormat.setCompressOutput(conf, true);
        FileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
        return true;
    }
    return false;
}

From source file: ronchy.BigramCount.java

License: Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];

    int mapTasks = Integer.parseInt(args[2]);
    int reduceTasks = Integer.parseInt(args[3]);

    sLogger.info("Tool: BigramCount");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - number of mappers: " + mapTasks);
    sLogger.info(" - number of reducers: " + reduceTasks);

    JobConf conf = new JobConf(BigramCount.class);
    conf.setJobName("BigramCount");

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    /**
     *  Note that these must match the Class arguments given in the mapper 
     */
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    FileSystem.get(outputDir.toUri(), conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file: sa.edu.kaust.indexing.DemoCountTrecDocuments.java

License: Apache License

/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String mappingFile = args[2];

    sLogger.info("input: " + inputPath);
    sLogger.info("output dir: " + outputPath);
    sLogger.info("docno mapping file: " + mappingFile);

    JobConf conf = new JobConf(DemoCountTrecDocuments.class);
    conf.setJobName("DemoCountTrecDocuments");

    conf.setNumReduceTasks(0);

    // Pass in the class name as a String; this makes the mapper general
    // in being able to load any collection of Indexable objects that has
    // docid/docno mapping specified by a DocnoMapping object
    conf.set("DocnoMappingClass", "edu.umd.cloud9.collection.trec.TrecDocnoMapping");

    // put the mapping file in the distributed cache so each map worker will
    // have it
    //DistributedCache.addCacheFile(new URI(mappingFile), conf);
    if (conf.get("mapred.job.tracker").equals("local")) {
        conf.set("DocnoMappingFile", mappingFile);
    } else {
        DistributedCache.addCacheFile(new URI(mappingFile), conf);
    }

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(TrecDocumentInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper.class);

    // delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
}

From source file: sa.edu.kaust.twitter.preprocess.spam.RemoveTweetsOfSpamUsers.java

License: Apache License

public static int removeTweetsOfSpamUsers(String inputPath, String outputPath, int numReducers,
        String spamUserListFile, long startID, long endID, String nTweetsFile, Boolean spam) throws Exception {
    sLogger.info("input: " + inputPath);
    sLogger.info("output dir: " + outputPath);
    sLogger.info("spam user list file: " + spamUserListFile);

    JobConf conf = new JobConf(RemoveTweetsOfSpamUsers.class);
    FileSystem fs = FileSystem.get(conf);
    conf.setJobName("RemoveSpamUserTweets");
    conf.setLong("startID", startID);
    conf.setLong("endID", endID);
    conf.setNumReduceTasks(numReducers);
    conf.setBoolean("spam", spam);

    // put the mapping file in the distributed cache so each map worker will
    // have it
    //DistributedCache.addCacheFile(new URI(mappingFile), conf);

    if (conf.get("mapred.job.tracker").equals("local")) {
        conf.set("SpamUserListFile", spamUserListFile);
    } else {
        DistributedCache.addCacheFile(new URI(spamUserListFile), conf);
    }

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(TweetWritable.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // delete the output directory if it exists already
    //FileSystem.get(conf).delete(new Path(outputPath), true);
    if (fs.exists(new Path(outputPath))) {
        sLogger.info("Output already exists: skipping!");
        return FSProperty.readInt(fs, nTweetsFile);
    }

    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int nonSpamTweets = (int) counters.findCounter(Statistics.NON_SPAM_TWEETS).getCounter();
    FSProperty.writeInt(fs, nTweetsFile, nonSpamTweets);
    sLogger.info("num of non-spam tweets: " + nonSpamTweets);
    return nonSpamTweets;
}