Example usage for org.apache.hadoop.mapred FileOutputFormat setOutputCompressorClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.FileOutputFormat.setOutputCompressorClass.

Prototype

public static void setOutputCompressorClass(JobConf conf, Class<? extends CompressionCodec> codecClass)

Source Link

Document

Set the CompressionCodec to be used to compress job outputs.

Usage

From source file:org.apache.flink.batch.connectors.hive.HiveTableOutputFormat.java

License:Apache License

/**
 * Creates a {@link HivePartitionWriter} that writes to the given location.
 *
 * <p>All settings are applied to a copy of {@code jobConf}, so the output directory and the
 * compression options chosen here do not modify the shared configuration.
 *
 * @param location value stored under the {@code OUTDIR} key of the copied configuration
 * @return a writer built from the copied configuration, the instantiated output format, the
 *         Hive record writer and the output committer
 * @throws IOException if one of the Hadoop calls fails or Hive cannot create the record writer
 *         (the {@code HiveException} is wrapped)
 * @throws FlinkRuntimeException if the output format cannot be instantiated, or if the
 *         configured compression codec cannot be loaded or is not a {@code CompressionCodec}
 */
private HivePartitionWriter writerForLocation(String location) throws IOException {
    JobConf clonedConf = new JobConf(jobConf);
    clonedConf.set(OUTDIR, location);
    OutputFormat outputFormat;
    try {
        StorageDescriptor sd = hiveTablePartition.getStorageDescriptor();
        Class outputFormatClz = Class.forName(sd.getOutputFormat(), true,
                Thread.currentThread().getContextClassLoader());
        // Hive may replace the declared output format with its own substitute class.
        outputFormatClz = HiveFileFormatUtils.getOutputFormatSubstitute(outputFormatClz);
        outputFormat = (OutputFormat) outputFormatClz.newInstance();
    } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
        throw new FlinkRuntimeException("Unable to instantiate the hadoop output format", e);
    }
    ReflectionUtils.setConf(outputFormat, clonedConf);
    OutputCommitter outputCommitter = clonedConf.getOutputCommitter();
    JobContext jobContext = new JobContextImpl(clonedConf, new JobID());
    outputCommitter.setupJob(jobContext);
    final boolean isCompressed = clonedConf.getBoolean(HiveConf.ConfVars.COMPRESSRESULT.varname, false);
    if (isCompressed) {
        String codecStr = clonedConf.get(HiveConf.ConfVars.COMPRESSINTERMEDIATECODEC.varname);
        if (!StringUtils.isNullOrWhitespaceOnly(codecStr)) {
            try {
                // asSubclass verifies that the configured class really is a codec, which the
                // previous unchecked cast did not do.
                Class<? extends CompressionCodec> codec = Class
                        .forName(codecStr, true, Thread.currentThread().getContextClassLoader())
                        .asSubclass(CompressionCodec.class);
                FileOutputFormat.setOutputCompressorClass(clonedConf, codec);
            } catch (ClassNotFoundException | ClassCastException e) {
                throw new FlinkRuntimeException("Unable to load the compression codec " + codecStr, e);
            }
        }
        String typeStr = clonedConf.get(HiveConf.ConfVars.COMPRESSINTERMEDIATETYPE.varname);
        if (!StringUtils.isNullOrWhitespaceOnly(typeStr)) {
            // valueOf throws IllegalArgumentException for a name that is not a CompressionType.
            SequenceFile.CompressionType style = SequenceFile.CompressionType.valueOf(typeStr);
            SequenceFileOutputFormat.setOutputCompressionType(clonedConf, style);
        }
    }
    // -1 is used when the configuration does not carry a task partition.
    String taskPartition = String.valueOf(clonedConf.getInt("mapreduce.task.partition", -1));
    Path taskPath = FileOutputFormat.getTaskOutputPath(clonedConf, taskPartition);
    FileSinkOperator.RecordWriter recordWriter;
    try {
        recordWriter = HiveFileFormatUtils.getRecordWriter(clonedConf, outputFormat, outputClass, isCompressed,
                tblProperties, taskPath, Reporter.NULL);
    } catch (HiveException e) {
        throw new IOException(e);
    }
    return new HivePartitionWriter(clonedConf, outputFormat, recordWriter, outputCommitter);
}

From source file:org.archive.wayback.hadoop.CDXSort.java

License:Apache License

/**
 * The main driver for sort program. Invoke this method to submit the
 * map/reduce job.
 *
 * <p>Recognized flags are {@code -m <maps>}, {@code --compress-output},
 * {@code --funky-input}, {@code --dereference-inputs} and {@code --canonicalize};
 * exactly three further arguments are expected: split file, input path and
 * output path.
 *
 * @param args
 *             command-line arguments as described above.
 * @return the result of {@code printUsage()} when the arguments are invalid,
 *         otherwise 0 once the job has run.
 * @throws Exception
 *             When the split file cannot be opened, or there is communication
 *             problems with the job tracker.
 */
public int run(String[] args) throws Exception {

    boolean compressOutput = false;
    boolean dereferenceInputs = false;
    // Parsed for compatibility; only the disabled configuration below reads it.
    boolean canonicalize = false;
    boolean funkyInput = false;

    JobConf jobConf = new JobConf(getConf(), CDXSort.class);
    jobConf.setJobName("cdxsort");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    // Only the disabled status message at the end reads this value.
    ClusterStatus cluster = client.getClusterStatus();

    List<String> otherArgs = new ArrayList<String>();

    for (int i = 0; i < args.length; ++i) {
        String arg = args[i];
        if ("-m".equals(arg)) {
            // Check for the value explicitly instead of catching
            // ArrayIndexOutOfBoundsException.
            if (i + 1 >= args.length) {
                System.out.println("ERROR: Required parameter missing from " + arg);
                return printUsage(); // exits
            }
            String mapTasks = args[++i];
            try {
                jobConf.setNumMapTasks(Integer.parseInt(mapTasks));
            } catch (NumberFormatException except) {
                System.out.println("ERROR: Integer expected instead of " + mapTasks);
                return printUsage();
            }
        } else if ("--compress-output".equals(arg)) {
            compressOutput = true;
        } else if ("--funky-input".equals(arg)) {
            funkyInput = true;
        } else if ("--dereference-inputs".equals(arg)) {
            dereferenceInputs = true;
        } else if ("--canonicalize".equals(arg)) {
            canonicalize = true;
        } else {
            otherArgs.add(arg);
        }
    }

    // Make sure there are exactly 3 parameters left: split input output
    if (otherArgs.size() != 3) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 3.");
        return printUsage();
    }

    String splitPath = otherArgs.get(0);
    String inputPath = otherArgs.get(1);
    String outputPath = otherArgs.get(2);

    // load the split file, find and set the number of reduces
    // NOTE: the loading itself is disabled (see the commented-out block), so
    // the partitioner is currently unused.
    AlphaPartitioner partitioner = new AlphaPartitioner();
    File localSplitFile = new File(splitPath);
    // Nothing reads the split file while boundary loading is disabled. Open it
    // once so a missing file still fails here, and close it right away instead
    // of leaking the handle. Re-enabling the block below needs a reader again.
    new FileInputStream(localSplitFile).close();
    //      try {
    //         partitioner.loadBoundaries(bis);
    //      } catch (IOException except) {
    //         System.err.println("ERROR: Problem loading file " + splitPath);
    //         return printUsage(); // exits
    //      }
    //      jobConf.setNumReduceTasks(partitioner.getNumPartitions());
    //
    //      // copy the split file into the FS, add to the DistributedCache:
    ////      AlphaPartitioner.setPartitionFile(jobConf, localSplitFile);
    //      AlphaPartitioner.setSplitCache(jobConf, localSplitFile);
    //      System.err.println("uploaded split file to FS and DistributedCache");
    //
    //      // Set job configs:
    //      jobConf.setInputFormat(TextInputFormat.class);
    //
    //      jobConf.setOutputFormat(TextOutputFormat.class);
    //      if (canonicalize) {
    //         jobConf.setMapperClass(CDXCanonicalizerMapClass.class);
    //      } else {
    //         jobConf.setMapperClass(CDXMapClass.class);
    //      }
    //      jobConf.setOutputKeyClass(Text.class);
    //      jobConf.setOutputValueClass(Text.class);
    //      jobConf.set("mapred.textoutputformat.separator", " ");
    //      jobConf.setPartitionerClass(AlphaPartitioner.class);

    // Set job input:
    // With --dereference-inputs a de-referencing mapper replaces the identity
    // mapper. Earlier attempts that read inputPath line by line and added each
    // line as an input path were abandoned as too slow.
    if (dereferenceInputs) {
        if (funkyInput) {
            jobConf.setMapperClass(FunkyDeReffingCDXCanonicalizerMapClass.class);
        } else {
            jobConf.setMapperClass(DeReffingCDXCanonicalizerMapClass.class);
        }
    }
    // The input path is set the same way whether or not inputs are dereferenced.
    FileInputFormat.setInputPaths(jobConf, new Path(inputPath));
    int inputCount = 1;

    // Set job output:
    FileOutputFormat.setOutputPath(jobConf, new Path(outputPath));

    if (compressOutput) {
        FileOutputFormat.setCompressOutput(jobConf, true);
        FileOutputFormat.setOutputCompressorClass(jobConf, GzipCodec.class);
    }

    //      System.out.println("Running on " + cluster.getTaskTrackers()
    //            + " nodes, processing " + inputCount + " files/directories"
    //            + " into " + outputPath + " with "
    //            + partitioner.getNumPartitions() + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}

From source file:org.terrier.utility.io.HadoopUtility.java

License:Mozilla Public License

/** Utility method to set JobOutputCompression if possible.
 * In general, I find that JobOutputCompression fails for
 * local job trackers, so this code checks the job tracker
 * location first./*from   ww w  . ja  v  a  2s  .c o m*/
 * @param conf JobConf of job.
 * @return true if JobOutputCompression was set.
 */
public static boolean setJobOutputCompression(JobConf conf) {
    if (!conf.get("mapred.job.tracker").equals("local")) {
        FileOutputFormat.setCompressOutput(conf, true);
        FileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
        return true;
    }
    return false;
}