Example usage for org.apache.hadoop.fs Path Path

Introduction

This page collects example usages of the org.apache.hadoop.fs.Path constructor.

Prototype

public Path(URI aUri)

Source Link

Document

Construct a path from a URI

Usage

From source file:PairsPMI_M.java

License:Apache License

/**
 * Runs this tool./* w ww  .jav  a  2  s  .c o  m*/
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    // First MapReduce Job

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool name: " + PairsPMI_M.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - tmp path: " + outputPath + "/tmp");
    LOG.info(" - num reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(PairsPMI_M.class.getSimpleName());
    job.setJarByClass(PairsPMI_M.class);

    // Delete the tmp directory if it exists already
    Path tmpDir = new Path("tmp_wj");
    FileSystem.get(getConf()).delete(tmpDir, true);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path("tmp_wj"));

    job.setMapOutputKeyClass(PairOfStrings.class);
    job.setMapOutputValueClass(FloatWritable.class);
    job.setOutputKeyClass(PairOfStrings.class);
    job.setOutputValueClass(FloatWritable.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    //    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    double time1 = (System.currentTimeMillis() - startTime) / 1000.0;
    System.out.println("Job Finished in " + time1 + " seconds");
    numRecords = job.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS")
            .getValue();

    /*
     *  Second MapReduce Job
     */

    LOG.info("Tool name: " + PairsPMI_M.class.getSimpleName());
    LOG.info("second stage of MapReduce");
    LOG.info(" - input from tmp path: " + outputPath + "/tmp_wj");
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - num reducers: " + reduceTasks);

    // set the global variable
    Configuration conf = getConf();
    conf.setLong("numRec", numRecords);

    job = Job.getInstance(getConf());
    job.setJobName(PairsPMI_M.class.getSimpleName());
    job.setJarByClass(PairsPMI_M.class);

    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    job.setNumReduceTasks(reduceTasks);
    FileInputFormat.setInputPaths(job, new Path("tmp_wj/part*"));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setMapOutputKeyClass(PairOfStrings.class);
    job.setMapOutputValueClass(FloatWritable.class);
    // job.setOutputKeyClass(PairOfStrings.class);
    job.setOutputKeyClass(Text.class);

    job.setOutputValueClass(FloatWritable.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    //   job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapperSecond.class);
    //    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducerSecond.class);
    job.setPartitionerClass(MyPartitioner.class);

    startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    double time2 = (System.currentTimeMillis() - startTime) / 1000.0;
    System.out.println("Second job finished in " + time2 + " seconds");
    System.out.println("Total time: " + (time1 + time2) + " seconds");

    return 0;
}

From source file:RepackWikipedia.java

License:Apache License

/**
 * Repacks a Wikipedia XML dump into sequence files using a map-only job.
 *
 * <p>The input, output, mapping-file and compression-type options are required; the language
 * option is optional and, when given, must be exactly two characters long. Any existing
 * output directory is deleted before the job is submitted.
 *
 * @param args command-line arguments
 * @return 0 after the job has run, -1 when the arguments are missing or invalid
 * @throws Exception if file system access or job execution fails
 */
@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output location")
            .create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("mapping file")
            .create(MAPPING_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("block|record|none").hasArg()
            .withDescription("compression type").create(COMPRESSION_TYPE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de").hasArg().withDescription("two-letter language code")
            .create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(MAPPING_FILE_OPTION) || !cmdline.hasOption(COMPRESSION_TYPE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_FILE_OPTION);
    String compressionType = cmdline.getOptionValue(COMPRESSION_TYPE_OPTION);

    if (!"block".equals(compressionType) && !"record".equals(compressionType)
            && !"none".equals(compressionType)) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    // this is the default block size
    int blocksize = 1000000;

    // Seed the job from the tool's configuration so settings supplied to the tool
    // (for example through generic command-line options) reach the job as well.
    JobConf conf = new JobConf(getConf(), RepackWikipedia.class);
    conf.setJarByClass(RepackWikipedia.class);
    conf.setJobName(String.format("RepackWikipedia[%s: %s, %s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            OUTPUT_OPTION, outputPath, COMPRESSION_TYPE_OPTION, compressionType, LANGUAGE_OPTION, language));

    conf.set(DOCNO_MAPPING_FIELD, mappingFile);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - XML dump file: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - docno mapping data file: " + mappingFile);
    LOG.info(" - compression type: " + compressionType);
    LOG.info(" - language: " + language);

    if ("block".equals(compressionType)) {
        LOG.info(" - block size: " + blocksize);
    }

    // Map-only job: no reducers.
    conf.setNumReduceTasks(0);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    if ("none".equals(compressionType)) {
        FileOutputFormat.setCompressOutput(conf, false);
    } else {
        FileOutputFormat.setCompressOutput(conf, true);

        if ("record".equals(compressionType)) {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
            conf.setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    if (language != null) {
        conf.set("wiki.language", language);
    }

    conf.setInputFormat(WikipediaPageInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(WikipediaPage.class);

    conf.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already, using the same configuration
    // the job itself runs with.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
}

From source file:InputDataUsage.java

License:Apache License

/**
 * Loads the vectorized documents from {@code tfidf-vectors/part-r-00000} and prints the
 * key, vector and vector size of the first entry the map iterates over.
 *
 * @param argv unused
 * @throws IOException declared by the signature; presumably raised by the reader on I/O
 *                     failure (the reader is not visible here)
 */
public static void main(String[] argv) throws IOException {
    Configuration configuration = new Configuration();
    Path vectorFile = new Path("tfidf-vectors/part-r-00000");
    HashMap<Text, VectorWritable> documents = InputData.vectorizedTextReader(configuration, vectorFile);

    // Only one sample entry is shown; an empty map prints nothing.
    java.util.Iterator<java.util.Map.Entry<Text, VectorWritable>> entries = documents.entrySet().iterator();
    if (entries.hasNext()) {
        java.util.Map.Entry<Text, VectorWritable> first = entries.next();
        System.out.println("Document ID: " + first.getKey());
        System.out.println("Vector: " + first.getValue());
        System.out.println("Dimensions: " + first.getValue().get().size());
    }
}

From source file:IndexWords.java

License:Apache License

/**
 * Runs the "indexwords" job up to five times, feeding each iteration's output directory in as
 * the next iteration's input.
 *
 * <p>Iteration {@code i} writes to {@code args[1] + i}. After each job the RESIDUAL_COUNTER
 * value is divided by 10000 and then by {@code N}; the loop stops early once that ratio drops
 * below 0.001.
 *
 * @param args {@code args[0]} is the initial input path, {@code args[1]} the output path
 *             prefix; any further arguments only determine the size of {@code checkWords}
 * @return -1 when fewer than two arguments are given, otherwise 0
 * @throws Exception if a job fails to run
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        return -1;
    }

    checkWords = new String[args.length - 2];

    final int maxIterations = 5;
    Path currentInput = new Path(args[0]);

    for (int iteration = 0; iteration < maxIterations; iteration++) {
        Path iterationOutput = new Path(args[1] + iteration);

        JobConf jobConf = new JobConf(getConf(), IndexWords.class);
        jobConf.setJobName("indexwords");

        jobConf.setInputFormat(KeyValueTextInputFormat.class);
        jobConf.setOutputFormat(TextOutputFormat.class);

        jobConf.setOutputKeyClass(Text.class);
        jobConf.setOutputValueClass(Text.class);

        jobConf.setMapperClass(MapClass.class);
        jobConf.setReducerClass(Reduce.class);

        FileInputFormat.setInputPaths(jobConf, currentInput);
        FileOutputFormat.setOutputPath(jobConf, iterationOutput);

        RunningJob finishedJob = JobClient.runJob(jobConf);

        // The next pass consumes what this pass just produced.
        currentInput = iterationOutput;

        double residual = finishedJob.getCounters().getCounter(RecordCounters.RESIDUAL_COUNTER) * 1.0 / 10000;
        double residualPerN = residual / (1.0 * N);
        System.out.println(N + " " + residualPerN);
        if (residualPerN < 0.001) {
            break;
        }
    }

    return 0;
}

From source file:PopularURLs.java

License:Open Source License

/**
 * Configures and runs the "popularurls" job, then exits with a status that reflects the
 * job's outcome.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
 * @throws Exception if the job cannot be configured or submitted
 */
public static void main(String[] args) throws Exception {
    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
        System.err.println("Usage: PopularURLs <input path> <output path>");
        System.exit(2);
    }

    Configuration conf = new Configuration();
    Job job = new Job(conf, "popularurls");
    job.setJarByClass(PopularURLs.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // waitForCompletion reports success as a boolean; surface it as the exit status so
    // calling scripts can detect a failed job.
    boolean succeeded = job.waitForCompletion(true);
    System.exit(succeeded ? 0 : 1);
}

From source file:ClimateData.java

License:Open Source License

/**
 * Configures and runs the "climatedata" job, using the same class as combiner and reducer.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
 * @throws Exception if the job fails to run
 */
public static void main(String[] args) throws Exception {
    JobConf jobConf = new JobConf(ClimateData.class);
    jobConf.setJobName("climatedata");

    // Input side.
    jobConf.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, new Path(args[0]));

    // Processing stages.
    jobConf.setMapperClass(Map.class);
    jobConf.setCombinerClass(Reduce.class);
    jobConf.setReducerClass(Reduce.class);

    // Output side.
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(IntWritable.class);
    jobConf.setOutputFormat(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(jobConf, new Path(args[1]));

    JobClient.runJob(jobConf);
}

From source file:TestComplexColumnInSchema.java

License:Apache License

/**
 * Reads the Parquet footer of the file at {@code path} with a default configuration and
 * stores it in {@code footer}.
 *
 * @throws IOException declared by the signature; presumably raised when the footer cannot
 *                     be read
 */
@BeforeClass
public static void setup() throws IOException {
    footer = ParquetFileReader.readFooter(new Configuration(), new Path(path));
}

From source file:LobFileStressTest.java

License:Apache License

/**
 * Returns the path of the LOB file for the requested mode.
 *
 * @param compress {@code true} selects {@code compressed.lob}, {@code false} selects
 *                 {@code integers.lob}
 * @return a new relative {@code Path} for the selected file name
 */
private Path getPath(boolean compress) {
    String fileName = compress ? "compressed.lob" : "integers.lob";
    return new Path(fileName);
}

From source file:LobFileStressTest.java

License:Apache License

/**
 * Returns the path of the big LOB file for the requested mode.
 *
 * @param compress {@code true} selects {@code big-compressed.lob}, {@code false} selects
 *                 {@code big.lob}
 * @return a new relative {@code Path} for the selected file name
 */
private Path getBigFilePath(boolean compress) {
    String fileName = compress ? "big-compressed.lob" : "big.lob";
    return new Path(fileName);
}

From source file:FormatStorage2ColumnStorageMR.java

License:Open Source License

/**
 * Submits the FormatStorage2ColumnStorageMR job and prints its map/reduce progress until it
 * completes, then exits with a status that reflects the job's outcome.
 *
 * <p>The output directory is deleted before submission. A progress line is printed whenever
 * the percentages change, or at least every three seconds.
 *
 * @param args exactly two arguments: the input path(s) and the output path
 * @throws Exception if the job cannot be configured, submitted or polled
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {

    if (args.length != 2) {
        System.out.println("FormatStorage2ColumnStorageMR <input> <output>");
        System.exit(-1);
    }

    // NOTE(review): the JobConf is built from FormatStorageMR.class rather than this class;
    // kept as in the original - confirm that this is intentional.
    JobConf conf = new JobConf(FormatStorageMR.class);

    conf.setJobName("FormatStorage2ColumnStorageMR");

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(4);

    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Unit.Record.class);

    conf.setMapperClass(FormatStorageMapper.class);
    conf.setReducerClass(ColumnStorageReducer.class);

    conf.setInputFormat(FormatStorageInputFormat.class);
    // Was misspelled "flase", which is not a valid boolean value.
    conf.set("mapred.output.compress", "false");

    Head head = new Head();
    initHead(head);

    head.toJobConf(conf);

    FileInputFormat.setInputPaths(conf, args[0]);
    Path outputPath = new Path(args[1]);
    FileOutputFormat.setOutputPath(conf, outputPath);

    // Remove any previous output so the job can write to the output directory.
    FileSystem fs = outputPath.getFileSystem(conf);
    fs.delete(outputPath, true);

    JobClient jc = new JobClient(conf);
    RunningJob rj = jc.submitJob(conf);

    String lastReport = "";
    // HH = 24-hour clock; the previous "hh" printed ambiguous 12-hour times without AM/PM.
    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss,SSS");
    long reportTime = System.currentTimeMillis();
    long maxReportInterval = 3 * 1000;
    boolean interrupted = false;
    while (!rj.isComplete()) {
        try {
            Thread.sleep(1000);
        } catch (InterruptedException e) {
            // Stop polling instead of swallowing the interrupt; restore the flag for
            // anything that inspects it afterwards.
            Thread.currentThread().interrupt();
            interrupted = true;
            break;
        }

        int mapProgress = Math.round(rj.mapProgress() * 100);
        int reduceProgress = Math.round(rj.reduceProgress() * 100);

        String report = " map = " + mapProgress + "%,  reduce = " + reduceProgress + "%";

        if (!report.equals(lastReport) || System.currentTimeMillis() >= reportTime + maxReportInterval) {

            String output = dateFormat.format(Calendar.getInstance().getTime()) + report;
            System.out.println(output);
            lastReport = report;
            reportTime = System.currentTimeMillis();
        }
    }

    if (interrupted) {
        System.err.println("Interrupted while waiting for the job to complete");
        System.exit(1);
    }

    // Report failure to the caller instead of always exiting with 0.
    System.exit(rj.isSuccessful() ? 0 : 1);

}