List of usage examples for org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath
public static void setOutputPath(Job job, Path outputDir)
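Before the real-world examples below, here is a minimal sketch of the call pattern. The driver class name and the use of Hadoop's default identity Mapper/Reducer are illustrative assumptions, not taken from any of the source files listed here. Note that the output directory must not exist when the job is submitted, which is why most of the examples below delete it before running.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver: copies text input to text output via the default
// identity Mapper/Reducer, showing where setOutputPath fits in job setup.
public class SetOutputPathSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "setOutputPath-sketch");
        job.setJarByClass(SetOutputPathSketch.class);
        // Defaults: TextInputFormat offsets/lines pass through unchanged.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Records the output directory in the job configuration; the
        // directory must not already exist or submission fails.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}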
From source file:com.kse.bigdata.main.Driver.java
License:Apache License
public static void main(String[] args) throws Exception {
    /**********************************************************************************
     * Merge the source files into one.
     * Should change the directories of each file before executing the program.
     **********************************************************************************/
    // String inputFileDirectory = "/media/bk/??/BigData_Term_Project/Debug";
    // String resultFileDirectory = "/media/bk/??/BigData_Term_Project/debug.csv";
    // File resultFile = new File(resultFileDirectory);
    // if (!resultFile.exists())
    //     new SourceFileMerger(inputFileDirectory, resultFileDirectory).mergeFiles();

    /**********************************************************************************
     * Hadoop Operation.
     * Before starting, check the length of the sequence we want to predict.
     **********************************************************************************/
    Configuration conf = new Configuration();

    // Compress intermediate map output with Snappy.
    conf.setBoolean("mapred.compress.map.output", true);
    conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");

    // Enable profiling
    // conf.setBoolean("mapred.task.profile", true);

    String testPath = null;
    String inputPath = null;
    String outputPath = null;
    int sampleSize = 1;
    ArrayList<String> results = new ArrayList<String>();

    for (int index = 0; index < args.length; index++) {
        /*
         * Mandatory commands
         */
        // Extract the input path from the command line.
        if (args[index].equals("-in"))
            inputPath = args[index + 1];

        // Extract the output path from the command line.
        if (args[index].equals("-out"))
            outputPath = args[index + 1];

        // Extract the test data path from the command line.
        if (args[index].equals("-test"))
            testPath = args[index + 1];

        /*
         * Optional commands
         */
        // Extract the number of neighbors.
        if (args[index].equals("-nn"))
            conf.setInt(Reduce.NUMBER_OF_NEAREAST_NEIGHBOR, Integer.parseInt(args[index + 1]));

        // Whether the job uses normalization or not.
        if (args[index].equals("-norm"))
            conf.setBoolean(Map.NORMALIZATION, true);

        // Extract the sample size to test.
        if (args[index].equals("-s"))
            sampleSize = Integer.valueOf(args[index + 1]);

        // Whether the job uses the mean or the median. [Default: mean]
        if (args[index].equals("-med"))
            conf.setBoolean(Reduce.MEDIAN, true);
    }

    String outputFileName = "part-r-00000";
    SequenceSampler sampler = new SequenceSampler(testPath, sampleSize);
    LinkedList<Sequence> testSequences = sampler.getRandomSample();

    // Test sequence
    // String testSeqString = "13.591-13.674-13.778-13.892-13.958-14.049-14.153-14.185-14.169-14.092-13.905-13.702-13.438-13.187-13.0-12.914-12.868-12.766-12.62-12.433-12.279-12.142-12.063-12.025-100";
    // Sequence testSeq = new Sequence(testSeqString);
    // LinkedList<Sequence> testSequences = new LinkedList<>();
    // testSequences.add(testSeq);

    for (Sequence seq : testSequences) {
        /* ******************** Hadoop Launch *********************** */
        System.out.println(seq.getTailString());

        conf.set(Map.INPUT_SEQUENCE, seq.toString());

        Job job = new Job(conf);
        job.setJarByClass(Driver.class);
        job.setJobName("term-project-driver");

        job.setMapperClass(Map.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);

        // Should think of another way to implement the combiner class;
        // the current implementation is not helpful to the job.
        // job.setCombinerClass(Combiner.class);

        // Set the number of reduce tasks to 1 to keep the 100 nearest neighbors in a sorted set.
        job.setNumReduceTasks(1);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        job.waitForCompletion(true);

        /*
         * When the job finishes, read its result and store it in results.
         */
        try {
            FileSystem hdfs = FileSystem.get(new Configuration());
            BufferedReader fileReader = new BufferedReader(
                    new InputStreamReader(hdfs.open(new Path(outputPath + "/" + outputFileName))));

            String line;
            while ((line = fileReader.readLine()) != null) {
                results.add(seq.getSeqString() + " " + line);
            }

            fileReader.close();

            hdfs.delete(new Path(outputPath), true);
            hdfs.close();
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /*
     * When all jobs finish, store their results in the output/result.csv file.
     */
    String finalOutputPath = "output/result.csv";
    try {
        FileSystem hdfs = FileSystem.get(new Configuration());
        Path file = new Path(finalOutputPath);
        if (hdfs.exists(file)) {
            hdfs.delete(file, true);
        }

        OutputStream os = hdfs.create(file);
        PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(os, "UTF-8"));

        // CSV file header
        printWriter.println("Actual,Predicted,MER,MAE");
        printWriter.flush();

        for (String result : results) {
            String[] tokens = result.split("\\s+");
            printWriter.println(tokens[0] + "," + tokens[1] + "," + tokens[2] + "," + tokens[3]);
            printWriter.flush();
        }

        printWriter.close();
        hdfs.close();
    } catch (IOException e) {
        e.printStackTrace();
        System.exit(1);
    }
}
From source file:com.kxen.han.projection.giraph.BspCase.java
License:Apache License
/**
 * Helper method to remove an old output directory if it exists,
 * and set the output path for any VertexOutputFormat that uses
 * FileOutputFormat.
 *
 * @param job Job to set the output path for
 * @param outputPath Path to output
 * @throws IOException
 */
public static void removeAndSetOutput(GiraphJob job, Path outputPath) throws IOException {
    FileUtils.deletePath(job.getConfiguration(), outputPath);
    FileOutputFormat.setOutputPath(job.getInternalJob(), outputPath);
}
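A hedged sketch of how a caller might use this helper, assuming Giraph's GiraphJob(Configuration, String) constructor and its blocking run(boolean) submit; the job name and output path are placeholders, not from BspCase itself:

// Hypothetical caller: clear any stale output, then submit the Giraph job.
Configuration conf = new Configuration();
GiraphJob job = new GiraphJob(conf, "projection-sketch");
removeAndSetOutput(job, new Path("/tmp/projection-out"));
if (!job.run(true)) {
    throw new IllegalStateException("Giraph job failed");
}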
From source file:com.kylinolap.job.hadoop.cube.CubeHFileJob.java
License:Apache License
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_HTABLE_NAME);
        parseOptions(options, args);

        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase();

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));

        File JarFile = new File(KylinConfig.getInstanceFromEnv().getKylinJobJarPath());
        if (JarFile.exists()) {
            job.setJar(KylinConfig.getInstanceFromEnv().getKylinJobJarPath());
        } else {
            job.setJarByClass(this.getClass());
        }

        addInputDirs(getOptionValue(OPTION_INPUT_PATH), job);
        FileOutputFormat.setOutputPath(job, output);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapperClass(CubeHFileMapper.class);
        job.setReducerClass(KeyValueSortReducer.class);

        // set job configuration
        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        Configuration conf = HBaseConfiguration.create(getConf());
        // add metadata to distributed cache
        attachKylinPropsAndMetadata(cube, job.getConfiguration());

        String tableName = getOptionValue(OPTION_HTABLE_NAME).toUpperCase();
        HTable htable = new HTable(conf, tableName);

        // Automatic config!
        HFileOutputFormat.configureIncrementalLoad(job, htable);

        // set block replication to 3 for hfiles
        conf.set(DFSConfigKeys.DFS_REPLICATION_KEY, "3");

        this.deletePath(job.getConfiguration(), output);

        return waitForCompletion(job);
    } catch (Exception e) {
        printUsage(options);
        log.error(e.getLocalizedMessage(), e);
        return 2;
    }
}
From source file:com.kylinolap.job.hadoop.cube.CuboidJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_NAME);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_NCUBOID_LEVEL);
        options.addOption(OPTION_INPUT_FORMAT);
        parseOptions(options, args);

        Path input = new Path(getOptionValue(OPTION_INPUT_PATH));
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase();
        int nCuboidLevel = Integer.parseInt(getOptionValue(OPTION_NCUBOID_LEVEL));
        String segmentName = getOptionValue(OPTION_SEGMENT_NAME);

        KylinConfig config = KylinConfig.getInstanceFromEnv();
        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        System.out.println("Starting: " + job.getJobName());
        FileInputFormat.setInputPaths(job, input);

        File jarFile = new File(config.getKylinJobJarPath());
        if (jarFile.exists()) {
            job.setJar(config.getKylinJobJarPath());
        } else {
            job.setJarByClass(this.getClass());
        }

        // Mapper
        if (this.mapperClass == null) {
            throw new Exception("Mapper class is not set!");
        }

        boolean isInputTextFormat = false;
        if (hasOption(OPTION_INPUT_FORMAT)
                && ("textinputformat".equalsIgnoreCase(getOptionValue(OPTION_INPUT_FORMAT)))) {
            isInputTextFormat = true;
        }

        if (isInputTextFormat) {
            job.setInputFormatClass(TextInputFormat.class);
        } else {
            job.setInputFormatClass(SequenceFileInputFormat.class);
        }
        job.setMapperClass(this.mapperClass);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // for base cuboid shuffle skew, some rowkeys aggregate far more records than others
        job.setCombinerClass(CuboidReducer.class);

        // Reducer
        job.setReducerClass(CuboidReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileOutputFormat.setOutputPath(job, output);

        // set job configuration
        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_NAME, segmentName);
        // add metadata to distributed cache
        attachKylinPropsAndMetadata(cube, job.getConfiguration());

        setReduceTaskNum(job, config, cubeName, nCuboidLevel);

        this.deletePath(job.getConfiguration(), output);

        return waitForCompletion(job);
    } catch (Exception e) {
        printUsage(options);
        log.error(e.getLocalizedMessage(), e);
        return 2;
    }
}
From source file:com.kylinolap.job.hadoop.cube.FactDistinctColumnsJob.java
License:Apache License
private void setupReduceOutput(Path output) throws IOException {
    job.setReducerClass(FactDistinctColumnsReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.OUTPUT_PATH, output.toString());

    job.setNumReduceTasks(1);

    deletePath(job.getConfiguration(), output);
}
From source file:com.kylinolap.job.hadoop.cube.MergeCuboidJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_NAME);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        parseOptions(options, args);

        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase();
        String segmentName = getOptionValue(OPTION_SEGMENT_NAME).toUpperCase();
        KylinConfig config = KylinConfig.getInstanceFromEnv();
        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        // CubeSegment cubeSeg = cubeMgr.findSegment(cube, segmentName);

        // start job
        String jobName = getOptionValue(OPTION_JOB_NAME);
        System.out.println("Starting: " + jobName);
        job = Job.getInstance(getConf(), jobName);

        // set job configuration - basic
        File JarFile = new File(config.getKylinJobJarPath());
        if (JarFile.exists()) {
            job.setJar(config.getKylinJobJarPath());
        } else {
            job.setJarByClass(this.getClass());
        }

        // setJobJar(job);
        addInputDirs(getOptionValue(OPTION_INPUT_PATH), job);

        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        FileOutputFormat.setOutputPath(job, output);

        // Mapper
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapperClass(MergeCuboidMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Reducer - only one
        job.setReducerClass(CuboidReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // set job configuration
        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_NAME, segmentName);
        // add metadata to distributed cache
        attachKylinPropsAndMetadata(cube, job.getConfiguration());

        setReduceTaskNum(job, config, cubeName, 0);

        this.deletePath(job.getConfiguration(), output);

        return waitForCompletion(job);
    } catch (Exception e) {
        printUsage(options);
        log.error(e.getLocalizedMessage(), e);
        return 2;
    }
}
From source file:com.kylinolap.job.hadoop.cube.RangeKeyDistributionJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        parseOptions(options, args);

        // start job
        String jobName = getOptionValue(OPTION_JOB_NAME);
        job = Job.getInstance(getConf(), jobName);

        File JarFile = new File(KylinConfig.getInstanceFromEnv().getKylinJobJarPath());
        if (JarFile.exists()) {
            job.setJar(KylinConfig.getInstanceFromEnv().getKylinJobJarPath());
        } else {
            job.setJarByClass(this.getClass());
        }

        addInputDirs(getOptionValue(OPTION_INPUT_PATH), job);

        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        FileOutputFormat.setOutputPath(job, output);
        // job.getConfiguration().set("dfs.block.size", "67108864");

        // Mapper
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapperClass(RangeKeyDistributionMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Reducer - only one
        job.setReducerClass(RangeKeyDistributionReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setNumReduceTasks(1);

        this.deletePath(job.getConfiguration(), output);

        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase();
        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeCapacity cubeCapacity = cube.getDescriptor().getCapacity();
        job.getConfiguration().set(BatchConstants.CUBE_CAPACITY, cubeCapacity.toString());

        return waitForCompletion(job);
    } catch (Exception e) {
        printUsage(options);
        log.error(e.getLocalizedMessage(), e);
        return 2;
    }
}
From source file:com.kylinolap.job.hadoop.cube.RowKeyDistributionCheckerJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_JOB_NAME);
        options.addOption(rowKeyStatsFilePath);
        parseOptions(options, args);

        String statsFilePath = getOptionValue(rowKeyStatsFilePath);

        // start job
        String jobName = getOptionValue(OPTION_JOB_NAME);
        job = Job.getInstance(getConf(), jobName);

        job.setJarByClass(this.getClass());

        addInputDirs(getOptionValue(OPTION_INPUT_PATH), job);

        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        FileOutputFormat.setOutputPath(job, output);

        // Mapper
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapperClass(RowKeyDistributionCheckerMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Reducer - only one
        job.setReducerClass(RowKeyDistributionCheckerReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setNumReduceTasks(1);

        job.getConfiguration().set("rowKeyStatsFilePath", statsFilePath);

        this.deletePath(job.getConfiguration(), output);

        return waitForCompletion(job);
    } catch (Exception e) {
        printUsage(options);
        log.error(e.getLocalizedMessage(), e);
        return 2;
    }
}
From source file:com.kylinolap.job.hadoop.invertedindex.IICreateHFileJob.java
License:Apache License
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_HTABLE_NAME);
        parseOptions(options, args);

        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));

        File JarFile = new File(KylinConfig.getInstanceFromEnv().getKylinJobJarPath());
        if (JarFile.exists()) {
            job.setJar(KylinConfig.getInstanceFromEnv().getKylinJobJarPath());
        } else {
            job.setJarByClass(this.getClass());
        }

        addInputDirs(getOptionValue(OPTION_INPUT_PATH), job);
        FileOutputFormat.setOutputPath(job, output);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapperClass(IICreateHFileMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);

        String tableName = getOptionValue(OPTION_HTABLE_NAME);
        HTable htable = new HTable(getConf(), tableName);
        HFileOutputFormat.configureIncrementalLoad(job, htable);

        this.deletePath(job.getConfiguration(), output);

        return waitForCompletion(job);
    } catch (Exception e) {
        printUsage(options);
        log.error(e.getLocalizedMessage(), e);
        return 2;
    }
}
From source file:com.kylinolap.job.hadoop.invertedindex.IIDistinctColumnsJob.java
License:Apache License
private void setupReduceOutput(Path output) throws IOException {
    job.setReducerClass(IIDistinctColumnsReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.OUTPUT_PATH, output.toString());

    job.setNumReduceTasks(1);

    deletePath(job.getConfiguration(), output);
}