List of usage examples for the org.apache.hadoop.fs.Path constructor
public Path(URI aUri)
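This constructor builds a Path directly from a java.net.URI; the string-based constructor used in most of the examples below parses its argument into a URI internally. A minimal sketch of the URI form (the namenode host, port, and file name are illustrative):

import java.net.URI;
import org.apache.hadoop.fs.Path;

public class PathFromUriExample {
    public static void main(String[] args) {
        // A fully qualified path: scheme, authority, and absolute path.
        Path p = new Path(URI.create("hdfs://namenode:8020/user/data/input.txt"));
        System.out.println(p.toUri().getScheme()); // hdfs
        System.out.println(p.getName());           // input.txt
    }
}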
From source file:BwaInterpreter.java
License:Open Source License
private void combineOutputSamFiles(String outputHdfsDir, List<String> returnedValues) {
    try {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        Path finalHdfsOutputFile = new Path(outputHdfsDir + "/FullOutput.sam");
        FSDataOutputStream outputFinalStream = fs.create(finalHdfsOutputFile, true);

        // Iterate over the resulting files in HDFS and aggregate them into a single file.
        for (int i = 0; i < returnedValues.size(); i++) {
            LOG.info("JMAbuin:: SparkBWA :: Returned file ::" + returnedValues.get(i));
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(fs.open(new Path(returnedValues.get(i)))));

            String line;
            line = br.readLine();

            while (line != null) {
                // Keep the SAM header (@-prefixed lines) only from the first file.
                if (i == 0 || !line.startsWith("@")) {
                    outputFinalStream.write((line + "\n").getBytes());
                }
                line = br.readLine();
            }
            br.close();

            fs.delete(new Path(returnedValues.get(i)), true);
        }

        outputFinalStream.close();
        fs.close();
    } catch (IOException e) {
        e.printStackTrace();
        LOG.error(e.toString());
    }
}
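The output path above is built by string concatenation. Path also offers a two-argument parent/child constructor that handles the separator; a minimal sketch (the directory and file names are illustrative):

import org.apache.hadoop.fs.Path;

public class PathChildExample {
    public static void main(String[] args) {
        // Equivalent to new Path(outputHdfsDir + "/FullOutput.sam"),
        // without hand-written separator handling.
        Path out = new Path("/user/output", "FullOutput.sam");
        System.out.println(out); // /user/output/FullOutput.sam
    }
}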
From source file:BwaInterpreter.java
License:Open Source License
/**
 * Runs BWA with the specified options.
 * @brief This function runs BWA with the input data and the options selected by the user.
 */
public void RunBwa() {
    LOG.info("JMAbuin:: Starting BWA");
    Bwa bwa = new Bwa(this.options);

    List<String> returnedValues;
    if (bwa.isPairedReads()) {
        JavaRDD<Tuple2<String, String>> readsRDD = handlePairedReadsSorting();
        returnedValues = MapPairedBwa(bwa, readsRDD);
    } else {
        JavaRDD<String> readsRDD = handleSingleReadsSorting();
        returnedValues = MapSingleBwa(bwa, readsRDD);
    }

    LOG.info("BwaRDD :: Total of returned lines from RDDs :: " + returnedValues.size());

    // If a reducer is used, the final output has to be stored in just one file.
    if (bwa.isUseReducer()) {
        combineOutputSamFiles(bwa.getOutputHdfsDir(), returnedValues);
    } else {
        for (String outputFile : returnedValues) {
            LOG.info("JMAbuin:: SparkBWA:: Returned file ::" + outputFile);
        }
    }

    // After the execution, if the temporary input file exists, it should be deleted.
    try {
        if ((this.inputTmpFileName != null) && (!this.inputTmpFileName.isEmpty())) {
            FileSystem fs = FileSystem.get(this.conf);
            fs.delete(new Path(this.inputTmpFileName), true);
            fs.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
        LOG.error(e.toString());
    }
}
From source file:BwaInterpreter.java
License:Open Source License
/**
 * Used to perform the sort operation in HDFS.
 * @brief This function provides a method to perform the sort phase in HDFS.
 * @author José M. Abuín
 * @param fileName1 The first file that contains input FASTQ reads, stored in HDFS.
 * @param fileName2 The second file that contains input FASTQ reads, stored in HDFS.
 * @return A JavaRDD that contains the paired reads, sorted.
 */
public JavaRDD<Tuple2<String, String>> SortInHDFS2(String fileName1, String fileName2) {
    Configuration conf = this.conf;
    LOG.info("JMAbuin:: Starting writing reads to HDFS");

    try {
        FileSystem fs = FileSystem.get(conf);
        Path outputFilePath = new Path(this.inputTmpFileName);

        // To write the paired reads
        FSDataOutputStream outputFinalStream = fs.create(outputFilePath, true);

        // To read paired reads from both files
        BufferedReader brFastqFile1 = new BufferedReader(new InputStreamReader(fs.open(new Path(fileName1))));
        BufferedReader brFastqFile2 = new BufferedReader(new InputStreamReader(fs.open(new Path(fileName2))));

        String lineFastq1;
        String lineFastq2;

        lineFastq1 = brFastqFile1.readLine();
        lineFastq2 = brFastqFile2.readLine();

        // Loop to read the two files. Both must have the same number of lines.
        while (lineFastq1 != null) {
            // The lines are written interleaved.
            outputFinalStream.write((lineFastq1 + "\n" + lineFastq2 + "\n").getBytes());

            // Read the next lines.
            lineFastq1 = brFastqFile1.readLine();
            lineFastq2 = brFastqFile2.readLine();
        }

        // Close the input and output files.
        brFastqFile1.close();
        brFastqFile2.close();
        outputFinalStream.close();

        // Now read the file created above and build the RDD from it.
        ContentSummary cSummary = fs.getContentSummary(outputFilePath);
        long length = cSummary.getLength();
        this.totalInputLength = length;
        fs.close();

        // If the user requested a specific number of partitions.
        if (this.options.getPartitionNumber() != 0) {
            // These options set the split size so that the correct number of partitions is obtained.
            this.conf.set("mapreduce.input.fileinputformat.split.maxsize",
                    String.valueOf(length / this.options.getPartitionNumber()));
            this.conf.set("mapreduce.input.fileinputformat.split.minsize",
                    String.valueOf(length / this.options.getPartitionNumber()));

            LOG.info("JMAbuin partitioning from HDFS:: "
                    + String.valueOf(length / this.options.getPartitionNumber()));

            // Using the FastqInputFormatDouble class, read values from the HDFS file into an RDD.
            return this.ctx.newAPIHadoopFile(this.inputTmpFileName, FastqInputFormatDouble.class,
                    Long.class, String.class, this.conf).mapPartitions(new BigFastq2RDDPartitionsDouble(), true);
        } else {
            // Using the FastqInputFormatDouble class, read values from the HDFS file into an RDD.
            return this.ctx.newAPIHadoopFile(this.inputTmpFileName, FastqInputFormatDouble.class,
                    Long.class, String.class, this.conf).map(new BigFastq2RDDDouble());
        }
    } catch (IOException e) {
        e.printStackTrace();
        LOG.error(e.toString());
        return null;
    }
}
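getContentSummary walks an entire subtree; since outputFilePath here is a single file, its length can also be read from the file status. A minimal sketch (the file name is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileLengthExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path p = new Path("/tmp/reads.fastq"); // illustrative path
        // For a single file this matches fs.getContentSummary(p).getLength().
        long length = fs.getFileStatus(p).getLen();
        System.out.println(length);
        fs.close();
    }
}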
From source file:TestStringRelevance.java
License:Apache License
@Override
public void setUp() throws Exception {
    fs.delete(new Path(INPUT), true);
    fs.delete(new Path(QUERY), true);
    fs.delete(new Path(OUTPUT), true);

    inputTap = new Hfs(new SequenceFile(new Fields("str1", "str2")), INPUT);
    TapCollector coll = new TapCollector(inputTap, new JobConf());
    coll.add(tuple1);
    coll.add(tuple2);
    coll.add(tuple3);
    coll.add(tuple4);
    coll.add(tuple5);
    coll.add(tuple6);
    coll.add(tuple7);
    coll.add(tuple8);
    coll.add(tuple9);
    coll.close();

    keyTap = new Hfs(new SequenceFile(new Fields("str")), QUERY);
    coll = new TapCollector(keyTap, new JobConf());
    coll.add(new Tuple(new Text("nathan@rapleaf.com")));
    coll.add(new Tuple(new Text("1@gmail.com")));
    coll.add(new Tuple(new Text("2@gmail.com")));
    coll.add(new Tuple(new Text("6@gmail.com")));
    coll.close();

    outputTap = new Hfs(new SequenceFile(new Fields("str1", "str2")), OUTPUT);
}
From source file:LinkReverser.java
License:Apache License
/**
 * The main driver for the link-reverser map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), LinkReverser.class);
    conf.setJobName("indexreverser");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                conf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                conf.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else {
                other_args.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }

    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
}
From source file:WikipediaDocnoMappingBuilder.java
License:Apache License
@SuppressWarnings("static-access") @Override/*from www .j av a 2 s .c om*/ public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output file") .create(OUTPUT_FILE_OPTION)); options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg() .withDescription("two-letter language code").create(LANGUAGE_OPTION)); options.addOption(KEEP_ALL_OPTION, false, "keep all pages"); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_FILE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (language.length() != 2) { System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } } String inputPath = cmdline.getOptionValue(INPUT_OPTION); String outputFile = cmdline.getOptionValue(OUTPUT_FILE_OPTION); boolean keepAll = cmdline.hasOption(KEEP_ALL_OPTION); String tmpPath = "tmp-" + WikipediaDocnoMappingBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - input: " + inputPath); LOG.info(" - output file: " + outputFile); LOG.info(" - keep all pages: " + keepAll); LOG.info(" - language: " + language); // Job job = Job.getInstance(getConf()); JobConf conf = new JobConf(WikipediaDocnoMappingBuilder.class); conf.setJarByClass(WikipediaDocnoMappingBuilder.class); conf.setJobName(String.format("BuildWikipediaDocnoMapping[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, OUTPUT_FILE_OPTION, outputFile, LANGUAGE_OPTION, language)); conf.setBoolean(KEEP_ALL_OPTION, keepAll); // .getConfiguration().setBoolean(KEEP_ALL_OPTION, keepAll); if (language != null) { conf.set("wiki.language", language); } conf.setNumReduceTasks(1); FileInputFormat.addInputPath(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(tmpPath)); FileOutputFormat.setCompressOutput(conf, false); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(IntWritable.class); conf.setInputFormat(WikipediaPageInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(MyReducer.class); // Delete the output directory if it exists already. FileSystem.get(getConf()).delete(new Path(tmpPath), true); // job.waitForCompletion(true); RunningJob job = JobClient.runJob(conf); job.waitForCompletion(); // JobClient jobClient = new JobClient(conf); long cnt = keepAll ? job.getCounters().findCounter(PageTypes.TOTAL).getValue() : job.getCounters().findCounter(PageTypes.ARTICLE).getValue(); WikipediaDocnoMapping.writeDocnoMappingData(FileSystem.get(getConf()), tmpPath + "/part-00000", (int) cnt, outputFile); FileSystem.get(getConf()).delete(new Path(tmpPath), true); return 0; }
From source file:DescSorter.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: flights <in> [<in>...] <out>");
        System.exit(2);
    }

    Job job = new Job(conf, "AvgDelays");
    job.setJarByClass(DescSorter.class);
    job.setMapperClass(FlightMapper.class);

    job.setMapOutputKeyClass(CompositeKey.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setPartitionerClass(CompositeKeyPartitioner.class);
    job.setSortComparatorClass(SortComparator.class);
    job.setGroupingComparatorClass(GroupingComparator.class);

    job.setReducerClass(AvgDelayReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    // All arguments but the last are input paths; the last one is the output path.
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
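When the input paths are known up front, the new-API FileInputFormat can also take them all in one varargs call instead of a loop of addInputPath. A minimal sketch (the paths are illustrative):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class SetInputPathsExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        // Equivalent to calling addInputPath once per path.
        FileInputFormat.setInputPaths(job, new Path("/data/2007"), new Path("/data/2008"));
    }
}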
From source file:CalculateHistogram.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: CalculateHistogram <in> <out>");
        System.exit(2);
    }

    Job job = new Job(conf, "MRDT - Generate Histogram");
    job.setJarByClass(CalculateHistogram.class);
    job.setMapperClass(HistogramMap.class);
    job.setReducerClass(HistogramReduce.class);

    //job.setOutputValueClass(HistogramBucket.class);
    //job.setMapOutputKeyClass(LongWritable.class);
    //job.setMapOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:SingleFileWriter.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("SingleFileWriter [fileSize ie. 1g/10g/100g]");
        return 1;
    }

    double fileSize = Double.parseDouble((args[0].split("g|G"))[0]) * 1024 * 1024 * 1024;

    String hdfsFolder = "/hdfs_test/";
    String hdfsFile = hdfsFolder + args[0];
    short replication = 1;
    boolean overWrite = true;
    int bufferSize = 65536;
    int blockSize = 536870912;
    double numIters = fileSize / (double) bufferSize;

    /* Initialize the byte buffer. Integer.SIZE is in bits, so divide by 8
       to fill the entire buffer with ints (each int occupies 4 bytes). */
    ByteBuffer buf = ByteBuffer.allocate(bufferSize);
    buf.order(ByteOrder.nativeOrder());
    for (int k = 0; k < bufferSize / (Integer.SIZE / 8); k++) {
        buf.putInt(k);
    }
    buf.flip();

    /* Create the file on HDFS */
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    Path hdfsFilePath = new Path(hdfsFile);
    OutputStream os = fs.create(hdfsFilePath, overWrite, bufferSize, replication, blockSize);

    /* Write the content of the byte buffer to the HDFS file */
    Timer t = new Timer();
    t.start(0);
    for (long i = 0; i < numIters; i++) {
        os.write(buf.array());
        buf.flip();
    }
    t.end(0);
    os.close();

    fs.delete(hdfsFilePath, true);

    t.dump();
    return 0;
}
From source file:DumpPageRankRecordsToPlainText.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);

    LOG.info("Tool name: " + DumpPageRankRecordsToPlainText.class.getSimpleName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    Configuration conf = new Configuration();
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    Job job = Job.getInstance(conf);
    job.setJobName(DumpPageRankRecordsToPlainText.class.getSimpleName());
    job.setJarByClass(DumpPageRankRecordsToPlainText.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}
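The drivers above delete the old output directory through FileSystem.get(conf), which resolves the default filesystem. When a Path carries its own scheme and authority (e.g. hdfs:// or file://), Path.getFileSystem(conf) resolves the filesystem that actually owns that path. A minimal sketch (the namenode URI is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DeleteOutputExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path output = new Path("hdfs://namenode:8020/user/out"); // illustrative URI
        // Resolve the filesystem from the path's own scheme and authority.
        FileSystem fs = output.getFileSystem(conf);
        fs.delete(output, true); // recursive delete, as in the drivers above
    }
}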