Example usage for org.apache.hadoop.fs Path Path

List of usage examples for org.apache.hadoop.fs Path Path

Introduction

In this page you can find the example usage for org.apache.hadoop.fs Path Path.

Prototype

public Path(URI aUri) 

Source Link

Document

Construct a path from a URI

Usage

From source file:inMapperStripes.java

License:Apache License

/**
 * Runs this tool./* ww w .j  a  v  a2 s  .  c  o  m*/
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;
    int window = cmdline.hasOption(WINDOW) ? Integer.parseInt(cmdline.getOptionValue(WINDOW)) : 2;

    LOG.info("Tool: " + inMapperStripes.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - window: " + window);
    LOG.info(" - number of reducers: " + reduceTasks);

    Job job = Job.getInstance(getConf());
    job.setJobName(inMapperStripes.class.getSimpleName());
    job.setJarByClass(inMapperStripes.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    job.getConfiguration().setInt("window", window);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(HMapSIW.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FloatWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file:DateExample_Year.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);/*from w  w  w .j ava2 s.co  m*/
    }
    Job job = new Job(conf, "word count fs");
    job.setJarByClass(DateExample_Year.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(IsValidKeyFormat.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:RunText.java

License:Apache License

public static void main(String[] args) throws Exception {
    o = new Options();
    JCommander jc = null;/*from w w w .j  av a2  s. c  om*/
    try {
        jc = new JCommander(o, args);
        jc.setProgramName("./runText");
    } catch (ParameterException e) {
        System.out.println(e.getMessage());
        String[] valid = { "-p", "path", "-d", "delimiter", "v", "value", "-i", "index" };
        new JCommander(o, valid).usage();
        System.exit(-1);
    }
    if (o.help) {
        jc.usage();
        System.exit(0);
    }
    path = new Path(o.path);
    delim = o.delimiter.getBytes()[0];
    toFind = o.value;
    index = o.index;
    numThreads = o.threads;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    TextInputFormat format = new TextInputFormat();
    long len = fs.getFileStatus(path).getLen() / numThreads;

    List<Thread> threads = Lists.newArrayList();

    for (int i = 0; i < numThreads; i++) {
        FileSplit split = new FileSplit(path, i * len, len, new String[] { "" });
        threads.add(new Thread(new RunText(split, format)));
    }

    runningThreads = new AtomicInteger(numThreads);

    for (Thread t : threads) {
        t.start();
    }

    int prev = 0;
    int current;
    long t1 = System.nanoTime();
    long t2;
    while (runningThreads.get() > 0) {
        Thread.sleep(5000);
        current = totalCount.get();
        t2 = System.nanoTime();
        System.out.println(String.format("%f records/sec", (current - prev) * 1e9 / (t2 - t1)));
        t1 = t2;
        prev = current;
    }

    for (Thread t : threads) {
        t.join();
    }

    fs.close();
}

From source file:RecordExtracting.java

License:Apache License

/**
 * Runs this tool./*from w  w w .j  av a2  s. c  om*/
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    /* options.addOption(OptionBuilder.withArgName("path").hasArg()
         .withDescription("input path").create(INPUT));
     options.addOption(OptionBuilder.withArgName("path").hasArg()
         .withDescription("output path").create(OUTPUT));
     options.addOption(OptionBuilder.withArgName("num").hasArg()
         .withDescription("number of nodes").create(NUM_NODES));
     //parsing more than 1 integer later;*/
    options.addOption(
            OptionBuilder.withArgName("src").hasArg().withDescription("spamming users").create(SOURCES));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    /* if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(NUM_NODES)||!cmdline.hasOption(SOURCES)) {
       System.out.println("args: " + Arrays.toString(args));
       HelpFormatter formatter = new HelpFormatter();
       formatter.setWidth(120);
       formatter.printHelp(this.getClass().getName(), options);
       ToolRunner.printGenericCommandUsage(System.out);
       return -1;
     }*/

    String inputPath = "xzzqskfinal/reviewsNew.txt";//cmdline.getOptionValue(INPUT);
    String outputPath = "xzzqskfinal/SpammingRecord";//cmdline.getOptionValue(OUTPUT);
    //int n = Integer.parseInt(cmdline.getOptionValue(NUM_NODES));
    //Change to array later
    String src = cmdline.getOptionValue(SOURCES);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool name: " + RecordExtracting.class.getSimpleName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    //LOG.info(" - numNodes: " + n);

    Configuration conf = getConf();

    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");
    //conf.setInt(NODE_CNT_FIELD, n);
    //more to be set later;
    conf.set(NODE_SRC, src);
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    Job job = Job.getInstance(conf);
    job.setJobName(RatingSpamming.class.getSimpleName());
    job.setJarByClass(RecordExtracting.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);
    //job.setOutputFormatClass(TextOutputFormat.class);
    //job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(PairOfStrings.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(PairOfStrings.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    job.waitForCompletion(true);
    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file:DataHBase.java

License:Open Source License

public void run(HashMap<String, String> config) throws Exception {

    //clean the former output if it exists
    Path p = new Path(config.get("hdfs_output_dir"));
    FileSystem fs = FileSystem.get(new Configuration());
    if (fs.exists(p)) {
        fs.delete(p, true);/*from  w  ww .  ja  v  a2 s .  c om*/
    }

    String junction = config.get("what_to_find"); // the name of the junction
    String date1 = config.get("date1");
    String date2 = config.get("date2");
    //date1 and date2 can be of a format YYYY-MM-DD
    if (date1.length() == 10)
        date1 = date1 + " 00:00:00";
    if (date2.length() == 10)
        date2 = date2 + " 23:59:59";
    System.out.println("Looking for data of " + junction + ": " + date1 + " - " + date2);

    //create timestamps (considering time zone!) to limit data
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    sdf.setTimeZone(TimeZone.getDefault());
    Long time1 = sdf.parse(date1).getTime();
    Long time2 = sdf.parse(date2).getTime();

    //run a job
    Configuration conf = HBaseConfiguration.create();
    conf.set("mapreduce.output.textoutputformat.separator", ","); //set comma as a delimiter

    Job job = new Job(conf, "Retrieve data from hbase");
    job.setJarByClass(DataHBase.class);

    Scan scan = new Scan();
    scan.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
    scan.setCacheBlocks(false); // don't set to true for MR jobs
    scan.setMaxVersions(1);
    scan.setTimeRange(time1, time2); //take a day we are interested in
    //set a filter for a junction name
    if (!junction.equals("")) {
        SingleColumnValueFilter filter = new SingleColumnValueFilter(Bytes.toBytes("data"),
                Bytes.toBytes("location_name"), CompareOp.EQUAL, Bytes.toBytes(junction));
        scan.setFilter(filter);
    }
    //add the specific columns to the output to limit the amount of data
    scan.addFamily(Bytes.toBytes("data"));

    TableMapReduceUtil.initTableMapperJob(config.get("hbase_table"), // input HBase table name
            scan, // Scan instance to control CF and attribute selection
            TableMap.class, // mapper
            Text.class, // mapper output key
            Text.class, // mapper output value
            job);

    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, new Path(config.get("hdfs_output_dir")));

    job.waitForCompletion(true);

}

From source file:RunPageRankBasic.java

License:Apache License

private float phase1(int i, int j, String basePath, int numNodes, boolean useCombiner,
        boolean useInMapperCombiner) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName("PageRank:Basic:iteration" + j + ":Phase1");
    job.setJarByClass(RunPageRankBasic.class);

    String in = basePath + "/iter" + formatter.format(i);
    String out = basePath + "/iter" + formatter.format(j) + "t";
    String outm = out + "-mass";

    // We need to actually count the number of part files to get the number of partitions (because
    // the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(getConf()).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;// ww  w.  j  ava 2  s  . c om
    }

    LOG.info("PageRank: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + numNodes);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info(" - useInmapCombiner: " + useInMapperCombiner);
    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    job.getConfiguration().setInt("NodeCount", numNodes);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    //job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
    job.getConfiguration().set("PageRankMassPath", outm);

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(useInMapperCombiner ? MapWithInMapperCombiningClass.class : MapClass.class);

    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }

    job.setReducerClass(ReduceClass.class);

    FileSystem.get(getConf()).delete(new Path(out), true);
    FileSystem.get(getConf()).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    float mass = Float.NEGATIVE_INFINITY;
    FileSystem fs = FileSystem.get(getConf());
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}

From source file:RunPageRankBasic.java

License:Apache License

private void phase2(int i, int j, float missing, String basePath, int numNodes) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName("PageRank:Basic:iteration" + j + ":Phase2");
    job.setJarByClass(RunPageRankBasic.class);

    LOG.info("missing PageRank mass: " + missing);
    LOG.info("number of nodes: " + numNodes);

    String in = basePath + "/iter" + formatter.format(j) + "t";
    String out = basePath + "/iter" + formatter.format(j);

    LOG.info("PageRank: iteration " + j + ": Phase2");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);

    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().setFloat("MissingMass", (float) missing);
    job.getConfiguration().setInt("NodeCount", numNodes);

    job.setNumReduceTasks(0);// www. jav a  2 s. c  o m

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(MapPageRankMassDistributionClass.class);

    FileSystem.get(getConf()).delete(new Path(out), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
}

From source file:DijikstraAlgo.java

License:GNU General Public License

public static void run(String[] args) throws Exception {
    IN = "hdfs://10.8.3.161:9000/user/sagar/input/";
    OUT = "hdfs://10.8.3.161:9000/user/sagar/output/";
    String input = IN;//from   w  ww. j  a v  a2  s .co m
    String output = OUT + System.nanoTime();
    String MAX_SPLIT_SIZE = args[0];
    boolean isdone = false;

    // Reiteration again and again till the convergence
    while (isdone == false) {
        JobConf conf = new JobConf(DijikstraAlgo.class);
        conf.setJobName("Dijikstra");
        // conf.set("mapred.max.split.size", MAX_SPLIT_SIZE);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapperClass(Map.class);
        conf.setReducerClass(Reduce.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(input));
        FileOutputFormat.setOutputPath(conf, new Path(output));

        JobClient.runJob(conf);

        input = output + "/part-00000";
        isdone = true;// set the job to NOT run again!
        Path ofile = new Path(input);
        FileSystem fs = FileSystem.get(new URI("hdfs://10.8.3.165:9000"), conf);
        //FileSystem fs = FileSystem.get(new Configuration());
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(ofile)));
        HashMap<Integer, Integer> imap = new HashMap<Integer, Integer>();
        String line = br.readLine();
        // Read the current output file and put it into HashMap
        while (line != null) {
            String[] sp = line.split("\t| ");
            int node = Integer.parseInt(sp[0]);
            int distance = Integer.parseInt(sp[1]);
            imap.put(node, distance);
            line = br.readLine();
        }
        br.close();

        // Check for convergence condition if any node is still left then
        // continue else stop
        Iterator<Integer> itr = imap.keySet().iterator();
        while (itr.hasNext()) {
            int key = itr.next();
            int value = imap.get(key);
            if (value >= 125) {
                isdone = false;
            }
        }
        input = output;
        output = OUT + System.nanoTime();
    }
}

From source file:lab2_2.java

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem.get(conf).delete(new Path(args[1]), true);

    Job job = Job.getInstance(conf, "drive time lab 2.1");
    job.setJarByClass(lab2_1.class);
    job.setMapperClass(PartitioningMapper.class);
    job.setPartitionerClass(TypePartitioner.class);
    job.setReducerClass(IdentityReducer.class);
    job.setNumReduceTasks(6);//  ww w  . j ava2  s. com

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:TestBytesBloomFilter.java

License:Apache License

public void testSetSanity() throws IOException {
    FileSystem local = FileSystem.getLocal(new Configuration());

    BytesBloomFilter set = new BytesBloomFilter(1000000, 4);
    byte[] arr1 = new byte[] { 1, 2, 3, 4, 5, 6, 7 };
    byte[] arr2 = new byte[] { 11, 12, 5, -2 };
    byte[] arr3 = new byte[] { 3, 4, 5 };
    set.add(arr1);/* w ww  .  ja va2s.co m*/
    set.add(arr2);

    for (byte i = 0; i < (byte) 125; i++) {
        set.add(new byte[] { i });
    }

    assertTrue(set.mayContain(arr1));
    assertTrue(set.mayContain(arr2));

    for (byte i = 0; i < (byte) 125; i++) {
        assertTrue(set.mayContain(new byte[] { i }));
    }

    //technically this could be an invalid statement, but the probability is low and this is a sanity check
    assertFalse(set.mayContain(arr3));

    //now test that we can write and read from file just fine
    local.delete(new Path("/tmp/filter-test.bloomfilter"), false);
    DataOutputStream os = new DataOutputStream(new FileOutputStream("/tmp/filter-test.bloomfilter"));
    set.write(os);
    os.close();

    BytesBloomFilter set2 = new BytesBloomFilter();
    DataInputStream is = new DataInputStream(new FileInputStream("/tmp/filter-test.bloomfilter"));
    set2.readFields(is);

    assertTrue(set2.mayContain(arr1));
    assertTrue(set2.mayContain(arr2));

    for (byte i = 0; i < (byte) 125; i++) {
        assertTrue(set2.mayContain(new byte[] { i }));
    }

    //technically this could be an invalid statement, but the probability is low and this is a sanity check
    assertFalse(set2.mayContain(arr3));

}