List of usage examples for the org.apache.hadoop.fs.Path constructor
public Path(URI aUri)
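Note: the snippets below all build paths with the Path(String) overload; none exercises Path(URI aUri) directly. As a bridge, here is a minimal sketch of the URI form (the namenode address and file name are hypothetical):

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PathFromUriExample {
    public static void main(String[] args) throws Exception {
        // A Path built from a URI keeps the scheme (hdfs) and authority (namenode:9000).
        URI uri = URI.create("hdfs://namenode:9000/user/data/input.txt");
        Path path = new Path(uri);

        // The resulting Path works anywhere a Path(String) would, e.g. an existence check.
        FileSystem fs = FileSystem.get(uri, new Configuration());
        System.out.println(path + " exists: " + fs.exists(path));
    }
}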
From source file:inMapperStripes.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS)
            ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;
    int window = cmdline.hasOption(WINDOW) ? Integer.parseInt(cmdline.getOptionValue(WINDOW)) : 2;

    LOG.info("Tool: " + inMapperStripes.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - window: " + window);
    LOG.info(" - number of reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(inMapperStripes.class.getSimpleName());
    job.setJarByClass(inMapperStripes.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    job.getConfiguration().setInt("window", window);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(HMapSIW.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FloatWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
From source file:DateExample_Year.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count fs");
    job.setJarByClass(DateExample_Year.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(IsValidKeyFormat.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:RunText.java
License:Apache License
public static void main(String[] args) throws Exception {
    o = new Options();
    JCommander jc = null;
    try {
        jc = new JCommander(o, args);
        jc.setProgramName("./runText");
    } catch (ParameterException e) {
        System.out.println(e.getMessage());
        String[] valid = { "-p", "path", "-d", "delimiter", "-v", "value", "-i", "index" };
        new JCommander(o, valid).usage();
        System.exit(-1);
    }
    if (o.help) {
        jc.usage();
        System.exit(0);
    }
    path = new Path(o.path);
    delim = o.delimiter.getBytes()[0];
    toFind = o.value;
    index = o.index;
    numThreads = o.threads;

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    TextInputFormat format = new TextInputFormat();

    // Integer division: any remainder bytes at the end of the file fall outside these splits.
    long len = fs.getFileStatus(path).getLen() / numThreads;
    List<Thread> threads = Lists.newArrayList();
    for (int i = 0; i < numThreads; i++) {
        FileSplit split = new FileSplit(path, i * len, len, new String[] { "" });
        threads.add(new Thread(new RunText(split, format)));
    }
    runningThreads = new AtomicInteger(numThreads);
    for (Thread t : threads) {
        t.start();
    }

    // Report throughput every five seconds until all worker threads finish.
    int prev = 0;
    int current;
    long t1 = System.nanoTime();
    long t2;
    while (runningThreads.get() > 0) {
        Thread.sleep(5000);
        current = totalCount.get();
        t2 = System.nanoTime();
        System.out.println(String.format("%f records/sec", (current - prev) * 1e9 / (t2 - t1)));
        t1 = t2;
        prev = current;
    }
    for (Thread t : threads) {
        t.join();
    }
    fs.close();
}
From source file:RecordExtracting.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    /*
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
            .withDescription("number of nodes").create(NUM_NODES)); // parsing more than 1 integer later
    */
    options.addOption(
            OptionBuilder.withArgName("src").hasArg().withDescription("spamming users").create(SOURCES));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    /*
    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(NUM_NODES)
            || !cmdline.hasOption(SOURCES)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }
    */

    String inputPath = "xzzqskfinal/reviewsNew.txt"; // cmdline.getOptionValue(INPUT);
    String outputPath = "xzzqskfinal/SpammingRecord"; // cmdline.getOptionValue(OUTPUT);
    // int n = Integer.parseInt(cmdline.getOptionValue(NUM_NODES)); // change to an array later
    String src = cmdline.getOptionValue(SOURCES);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS)
            ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool name: " + RecordExtracting.class.getSimpleName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    // LOG.info(" - numNodes: " + n);

    Configuration conf = getConf();
    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");
    // conf.setInt(NODE_CNT_FIELD, n); // more to be set later
    conf.set(NODE_SRC, src);
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    Job job = Job.getInstance(conf);
    job.setJobName(RecordExtracting.class.getSimpleName());
    job.setJarByClass(RecordExtracting.class);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);
    // job.setOutputFormatClass(TextOutputFormat.class);
    // job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(PairOfStrings.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(PairOfStrings.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    // Run the job once and time it.
    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}
From source file:DataHBase.java
License:Open Source License
public void run(HashMap<String, String> config) throws Exception {
    // Clean the former output if it exists.
    Path p = new Path(config.get("hdfs_output_dir"));
    FileSystem fs = FileSystem.get(new Configuration());
    if (fs.exists(p)) {
        fs.delete(p, true);
    }

    String junction = config.get("what_to_find"); // the name of the junction
    String date1 = config.get("date1");
    String date2 = config.get("date2");
    // date1 and date2 can be of the format YYYY-MM-DD
    if (date1.length() == 10)
        date1 = date1 + " 00:00:00";
    if (date2.length() == 10)
        date2 = date2 + " 23:59:59";
    System.out.println("Looking for data of " + junction + ": " + date1 + " - " + date2);

    // Create timestamps (considering the time zone!) to limit the data.
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    sdf.setTimeZone(TimeZone.getDefault());
    Long time1 = sdf.parse(date1).getTime();
    Long time2 = sdf.parse(date2).getTime();

    // Run a job.
    Configuration conf = HBaseConfiguration.create();
    conf.set("mapreduce.output.textoutputformat.separator", ","); // set comma as the delimiter
    Job job = new Job(conf, "Retrieve data from hbase");
    job.setJarByClass(DataHBase.class);

    Scan scan = new Scan();
    scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
    scan.setCacheBlocks(false); // don't set to true for MR jobs
    scan.setMaxVersions(1);
    scan.setTimeRange(time1, time2); // take the day we are interested in

    // Set a filter for the junction name.
    if (!junction.equals("")) {
        SingleColumnValueFilter filter = new SingleColumnValueFilter(Bytes.toBytes("data"),
                Bytes.toBytes("location_name"), CompareOp.EQUAL, Bytes.toBytes(junction));
        scan.setFilter(filter);
    }
    // Add the specific column family to the output to limit the amount of data.
    scan.addFamily(Bytes.toBytes("data"));

    TableMapReduceUtil.initTableMapperJob(config.get("hbase_table"), // input HBase table name
            scan, // Scan instance to control CF and attribute selection
            TableMap.class, // mapper
            Text.class, // mapper output key
            Text.class, // mapper output value
            job);
    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(job, new Path(config.get("hdfs_output_dir")));
    job.waitForCompletion(true);
}
From source file:RunPageRankBasic.java
License:Apache License
private float phase1(int i, int j, String basePath, int numNodes, boolean useCombiner,
        boolean useInMapperCombiner) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName("PageRank:Basic:iteration" + j + ":Phase1");
    job.setJarByClass(RunPageRankBasic.class);

    String in = basePath + "/iter" + formatter.format(i);
    String out = basePath + "/iter" + formatter.format(j) + "t";
    String outm = out + "-mass";

    // We need to actually count the number of part files to get the number of partitions
    // (because the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(getConf()).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;
    }

    LOG.info("PageRank: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + numNodes);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info(" - useInmapCombiner: " + useInMapperCombiner);
    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    job.getConfiguration().setInt("NodeCount", numNodes);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    // job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
    job.getConfiguration().set("PageRankMassPath", outm);

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(useInMapperCombiner ? MapWithInMapperCombiningClass.class : MapClass.class);
    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }
    job.setReducerClass(ReduceClass.class);

    FileSystem.get(getConf()).delete(new Path(out), true);
    FileSystem.get(getConf()).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    float mass = Float.NEGATIVE_INFINITY;
    FileSystem fs = FileSystem.get(getConf());
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }
    return mass;
}
From source file:RunPageRankBasic.java
License:Apache License
private void phase2(int i, int j, float missing, String basePath, int numNodes) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName("PageRank:Basic:iteration" + j + ":Phase2");
    job.setJarByClass(RunPageRankBasic.class);

    LOG.info("missing PageRank mass: " + missing);
    LOG.info("number of nodes: " + numNodes);

    String in = basePath + "/iter" + formatter.format(j) + "t";
    String out = basePath + "/iter" + formatter.format(j);

    LOG.info("PageRank: iteration " + j + ": Phase2");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);

    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().setFloat("MissingMass", missing);
    job.getConfiguration().setInt("NodeCount", numNodes);

    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(MapPageRankMassDistributionClass.class);

    FileSystem.get(getConf()).delete(new Path(out), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
}
From source file:DijikstraAlgo.java
License:GNU General Public License
public static void run(String[] args) throws Exception {
    IN = "hdfs://10.8.3.161:9000/user/sagar/input/";
    OUT = "hdfs://10.8.3.161:9000/user/sagar/output/";
    String input = IN;
    String output = OUT + System.nanoTime();
    String MAX_SPLIT_SIZE = args[0]; // only used by the commented-out split-size setting below
    boolean isdone = false;

    // Iterate again and again until convergence.
    while (isdone == false) {
        JobConf conf = new JobConf(DijikstraAlgo.class);
        conf.setJobName("Dijikstra");
        // conf.set("mapred.max.split.size", MAX_SPLIT_SIZE);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapperClass(Map.class);
        conf.setReducerClass(Reduce.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(input));
        FileOutputFormat.setOutputPath(conf, new Path(output));
        JobClient.runJob(conf);

        input = output + "/part-00000";
        isdone = true; // assume convergence; set back to false below if any node is still left

        Path ofile = new Path(input);
        FileSystem fs = FileSystem.get(new URI("hdfs://10.8.3.165:9000"), conf);
        // FileSystem fs = FileSystem.get(new Configuration());
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(ofile)));
        HashMap<Integer, Integer> imap = new HashMap<Integer, Integer>();
        String line = br.readLine();
        // Read the current output file and put it into the HashMap.
        while (line != null) {
            String[] sp = line.split("\t| ");
            int node = Integer.parseInt(sp[0]);
            int distance = Integer.parseInt(sp[1]);
            imap.put(node, distance);
            line = br.readLine();
        }
        br.close();

        // Convergence check: if any node is still left (distance >= 125), continue; else stop.
        Iterator<Integer> itr = imap.keySet().iterator();
        while (itr.hasNext()) {
            int key = itr.next();
            int value = imap.get(key);
            if (value >= 125) {
                isdone = false;
            }
        }
        input = output;
        output = OUT + System.nanoTime();
    }
}
From source file:lab2_2.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem.get(conf).delete(new Path(args[1]), true);

    Job job = Job.getInstance(conf, "drive time lab 2.1");
    job.setJarByClass(lab2_1.class);
    job.setMapperClass(PartitioningMapper.class);
    job.setPartitionerClass(TypePartitioner.class);
    job.setReducerClass(IdentityReducer.class);
    job.setNumReduceTasks(6);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:TestBytesBloomFilter.java
License:Apache License
public void testSetSanity() throws IOException {
    FileSystem local = FileSystem.getLocal(new Configuration());

    BytesBloomFilter set = new BytesBloomFilter(1000000, 4);
    byte[] arr1 = new byte[] { 1, 2, 3, 4, 5, 6, 7 };
    byte[] arr2 = new byte[] { 11, 12, 5, -2 };
    byte[] arr3 = new byte[] { 3, 4, 5 };

    set.add(arr1);
    set.add(arr2);
    for (byte i = 0; i < (byte) 125; i++) {
        set.add(new byte[] { i });
    }

    assertTrue(set.mayContain(arr1));
    assertTrue(set.mayContain(arr2));
    for (byte i = 0; i < (byte) 125; i++) {
        assertTrue(set.mayContain(new byte[] { i }));
    }

    // Technically this could be an invalid statement, but the probability is low
    // and this is a sanity check.
    assertFalse(set.mayContain(arr3));

    // Now test that we can write and read from a file just fine.
    local.delete(new Path("/tmp/filter-test.bloomfilter"), false);
    DataOutputStream os = new DataOutputStream(new FileOutputStream("/tmp/filter-test.bloomfilter"));
    set.write(os);
    os.close();

    BytesBloomFilter set2 = new BytesBloomFilter();
    DataInputStream is = new DataInputStream(new FileInputStream("/tmp/filter-test.bloomfilter"));
    set2.readFields(is);

    assertTrue(set2.mayContain(arr1));
    assertTrue(set2.mayContain(arr2));
    for (byte i = 0; i < (byte) 125; i++) {
        assertTrue(set2.mayContain(new byte[] { i }));
    }

    // Technically this could be an invalid statement, but the probability is low
    // and this is a sanity check.
    assertFalse(set2.mayContain(arr3));
}