Example usage for org.apache.hadoop.fs Path Path

Introduction

This page collects example usages of the org.apache.hadoop.fs.Path constructor.

Prototype

public Path(URI aUri)
public Path(String pathString)

Document

Construct a path from a URI, or from a plain path string (the form most of the examples below use).
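
A minimal sketch of both constructor forms (the class name and the hdfs:// authority below are placeholders, not taken from the examples on this page):

import java.net.URI;

import org.apache.hadoop.fs.Path;

public class PathConstructionExample {
    public static void main(String[] args) {
        // URI form: scheme, authority, and path are all explicit.
        Path fromUri = new Path(URI.create("hdfs://namenode:8020/user/data/input.txt"));

        // String form: a bare path, resolved against the default FileSystem when used.
        Path fromString = new Path("/user/data/input.txt");

        System.out.println(fromUri); // hdfs://namenode:8020/user/data/input.txt
        System.out.println(fromString); // /user/data/input.txt
    }
}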

Usage

From source file:BwaInterpreter.java

License:Open Source License

private void combineOutputSamFiles(String outputHdfsDir, List<String> returnedValues) {
    try {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        Path finalHdfsOutputFile = new Path(outputHdfsDir + "/FullOutput.sam");
        FSDataOutputStream outputFinalStream = fs.create(finalHdfsOutputFile, true);

        // Iterate over the resulting files in HDFS and aggregate them into a single file.
        for (int i = 0; i < returnedValues.size(); i++) {
            LOG.info("JMAbuin:: SparkBWA :: Returned file ::" + returnedValues.get(i));
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(fs.open(new Path(returnedValues.get(i)))));

            String line;
            line = br.readLine();

            while (line != null) {
                if (i == 0 || !line.startsWith("@")) {
                    //outputFinalStream.writeBytes(line+"\n");
                    outputFinalStream.write((line + "\n").getBytes());
                }

                line = br.readLine();
            }
            br.close();

            fs.delete(new Path(returnedValues.get(i)), true);
        }

        outputFinalStream.close();
        fs.close();
    } catch (IOException e) {
        e.printStackTrace();
        LOG.error(e.toString());
    }
}

From source file:BwaInterpreter.java

License:Open Source License

/**
 * Runs BWA with the specified options.
 * @brief This function runs BWA with the input data selected and with the options also selected by the user.
 */
public void RunBwa() {
    LOG.info("JMAbuin:: Starting BWA");
    Bwa bwa = new Bwa(this.options);

    List<String> returnedValues;
    if (bwa.isPairedReads()) {
        JavaRDD<Tuple2<String, String>> readsRDD = handlePairedReadsSorting();
        returnedValues = MapPairedBwa(bwa, readsRDD);
    } else {
        JavaRDD<String> readsRDD = handleSingleReadsSorting();
        returnedValues = MapSingleBwa(bwa, readsRDD);
    }

    LOG.info("BwaRDD :: Total of returned lines from RDDs :: " + returnedValues.size());

    // When a reducer is used, the final output has to be stored in a single file
    if (bwa.isUseReducer()) {
        combineOutputSamFiles(bwa.getOutputHdfsDir(), returnedValues);
    } else {
        for (String outputFile : returnedValues) {
            LOG.info("JMAbuin:: SparkBWA:: Returned file ::" + outputFile);
        }
    }

    // After the execution, delete the temporary input file if it exists
    try {
        if ((this.inputTmpFileName != null) && (!this.inputTmpFileName.isEmpty())) {
            FileSystem fs = FileSystem.get(this.conf);

            fs.delete(new Path(this.inputTmpFileName), true);

            fs.close();
        }

    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        LOG.error(e.toString());

    }
}

From source file:BwaInterpreter.java

License:Open Source License

/**
 * Used to perform the sort operation in HDFS
 * @brief This function provides a method to perform the sort phase in HDFS
 * @author José M. Abuín
 * @param fileName1 The first file that contains input FASTQ reads. Stored in HDFS
 * @param fileName2 The second file that contains input FASTQ reads. Stored in HDFS
 * @return A JavaRDD that contains the paired reads sorted
 */
public JavaRDD<Tuple2<String, String>> SortInHDFS2(String fileName1, String fileName2) {

    Configuration conf = this.conf;

    LOG.info("JMAbuin:: Starting writing reads to HDFS");

    try {
        FileSystem fs = FileSystem.get(conf);

        Path outputFilePath = new Path(this.inputTmpFileName);

        //To write the paired reads
        FSDataOutputStream outputFinalStream = fs.create(outputFilePath, true);

        //To read paired reads from both files
        BufferedReader brFastqFile1 = new BufferedReader(new InputStreamReader(fs.open(new Path(fileName1))));
        BufferedReader brFastqFile2 = new BufferedReader(new InputStreamReader(fs.open(new Path(fileName2))));

        String lineFastq1;
        String lineFastq2;

        lineFastq1 = brFastqFile1.readLine();
        lineFastq2 = brFastqFile2.readLine();

        // Loop to read both files; they must contain the same number of lines
        while (lineFastq1 != null) {
            // The paired lines are written interleaved
            outputFinalStream.write((lineFastq1 + "\n" + lineFastq2 + "\n").getBytes());

            // Read the next line from each file
            lineFastq1 = brFastqFile1.readLine();
            lineFastq2 = brFastqFile2.readLine();
        }

        //Close the input and output files
        brFastqFile1.close();
        brFastqFile2.close();
        outputFinalStream.close();

        // Now read the previously created file and build the RDD from it
        ContentSummary cSummary = fs.getContentSummary(outputFilePath);

        long length = cSummary.getLength();

        this.totalInputLength = length;

        fs.close();

        // If the user requested partitioning
        if (this.options.getPartitionNumber() != 0) {

            // These options set the split size so the requested number of partitions is obtained
            this.conf.set("mapreduce.input.fileinputformat.split.maxsize",
                    String.valueOf((length) / this.options.getPartitionNumber()));
            this.conf.set("mapreduce.input.fileinputformat.split.minsize",
                    String.valueOf((length) / this.options.getPartitionNumber()));

            LOG.info("JMAbuin partitioning from HDFS:: "
                    + String.valueOf((length) / this.options.getPartitionNumber()));

            // Use FastqInputFormatDouble to read values from the HDFS file and store them in an RDD
            return this.ctx.newAPIHadoopFile(this.inputTmpFileName, FastqInputFormatDouble.class, Long.class,
                    String.class, this.conf).mapPartitions(new BigFastq2RDDPartitionsDouble(), true);

        } else {
            // Use FastqInputFormatDouble to read values from the HDFS file and store them in an RDD
            return this.ctx.newAPIHadoopFile(this.inputTmpFileName, FastqInputFormatDouble.class, Long.class,
                    String.class, this.conf).map(new BigFastq2RDDDouble());
        }

    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        LOG.error(e.toString());

        return null;
    }
}

From source file:TestStringRelevance.java

License:Apache License

@Override
public void setUp() throws Exception {
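    // Clear any output left over from previous test runs.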
    fs.delete(new Path(INPUT), true);
    fs.delete(new Path(QUERY), true);
    fs.delete(new Path(OUTPUT), true);

    inputTap = new Hfs(new SequenceFile(new Fields("str1", "str2")), INPUT);
    TapCollector coll = new TapCollector(inputTap, new JobConf());
    coll.add(tuple1);
    coll.add(tuple2);
    coll.add(tuple3);
    coll.add(tuple4);
    coll.add(tuple5);
    coll.add(tuple6);
    coll.add(tuple7);
    coll.add(tuple8);
    coll.add(tuple9);
    coll.close();

    keyTap = new Hfs(new SequenceFile(new Fields("str")), QUERY);
    coll = new TapCollector(keyTap, new JobConf());
    coll.add(new Tuple(new Text("nathan@rapleaf.com")));
    coll.add(new Tuple(new Text("1@gmail.com")));
    coll.add(new Tuple(new Text("2@gmail.com")));
    coll.add(new Tuple(new Text("6@gmail.com")));
    coll.close();

    outputTap = new Hfs(new SequenceFile(new Fields("str1", "str2")), OUTPUT);
}

From source file:LinkReverser.java

License:Apache License

/**
 * The main driver for the link-reverser map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), LinkReverser.class);
    conf.setJobName("indexreverser");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                conf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                conf.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else {
                other_args.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
}

From source file:WikipediaDocnoMappingBuilder.java

License:Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output file")
            .create(OUTPUT_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg()
            .withDescription("two-letter language code").create(LANGUAGE_OPTION));
    options.addOption(KEEP_ALL_OPTION, false, "keep all pages");

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputFile = cmdline.getOptionValue(OUTPUT_FILE_OPTION);
    boolean keepAll = cmdline.hasOption(KEEP_ALL_OPTION);

    String tmpPath = "tmp-" + WikipediaDocnoMappingBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output file: " + outputFile);
    LOG.info(" - keep all pages: " + keepAll);
    LOG.info(" - language: " + language);

    // Job job = Job.getInstance(getConf());
    JobConf conf = new JobConf(WikipediaDocnoMappingBuilder.class);
    conf.setJarByClass(WikipediaDocnoMappingBuilder.class);
    conf.setJobName(String.format("BuildWikipediaDocnoMapping[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            OUTPUT_FILE_OPTION, outputFile, LANGUAGE_OPTION, language));

    conf.setBoolean(KEEP_ALL_OPTION, keepAll);
    // .getConfiguration().setBoolean(KEEP_ALL_OPTION, keepAll);
    if (language != null) {
        conf.set("wiki.language", language);
    }
    conf.setNumReduceTasks(1);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(tmpPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setInputFormat(WikipediaPageInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(new Path(tmpPath), true);

    // job.waitForCompletion(true);

    RunningJob job = JobClient.runJob(conf);
    job.waitForCompletion();

    // JobClient jobClient = new JobClient(conf);
    long cnt = keepAll ? job.getCounters().findCounter(PageTypes.TOTAL).getValue()
            : job.getCounters().findCounter(PageTypes.ARTICLE).getValue();

    WikipediaDocnoMapping.writeDocnoMappingData(FileSystem.get(getConf()), tmpPath + "/part-00000", (int) cnt,
            outputFile);

    FileSystem.get(getConf()).delete(new Path(tmpPath), true);

    return 0;
}

From source file:DescSorter.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: flights <in> <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "AvgDelays");
    job.setJarByClass(DescSorter.class);
    job.setMapperClass(FlightMapper.class);

    job.setMapOutputKeyClass(CompositeKey.class);
    job.setMapOutputValueClass(IntWritable.class);

    // Custom partitioner and comparators: the classic secondary-sort setup,
    // sorting on the full composite key while grouping related records together.
    job.setPartitionerClass(CompositeKeyPartitioner.class);
    job.setSortComparatorClass(SortComparator.class);
    job.setGroupingComparatorClass(GroupingComparator.class);

    job.setReducerClass(AvgDelayReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:CalculateHistogram.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }

    Job job = new Job(conf, "MRDT - Generate Histogram");
    job.setJarByClass(CalculateHistogram.class);
    job.setMapperClass(HistogramMap.class);
    job.setReducerClass(HistogramReduce.class);

    //job.setOutputValueClass(HistogramBucket.class);

    //job.setMapOutputKeyClass(LongWritable.class);
    //job.setMapOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:SingleFileWriter.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("SingleFileWriter [fileSize ie. 1g/10g/100g]");
        return 1;
    }

    double fileSize = Double.parseDouble((args[0].split("g|G"))[0]) * 1024 * 1024 * 1024;

    String hdfsFolder = "/hdfs_test/";
    String hdfsFile = hdfsFolder + args[0];
    short replication = 1;
    boolean overWrite = true;
    int bufferSize = 65536;
    int blockSize = 536870912;
    double numIters = fileSize / (double) bufferSize;

    /* Initialize byte buffer */
    ByteBuffer buf = ByteBuffer.allocate(bufferSize);
    buf.order(ByteOrder.nativeOrder());
    // Fill the buffer with sequential ints; each int occupies Integer.BYTES bytes.
    for (int k = 0; k < bufferSize / Integer.BYTES; k++) {
        buf.putInt(k);
    }
    buf.flip();

    /* Create file on HDFS */
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    Path hdfsFilePath = new Path(hdfsFile);
    OutputStream os = fs.create(hdfsFilePath, overWrite, bufferSize, replication, blockSize);
    /* Write the content of the byte buffer to the HDFS file */
    Timer t = new Timer();
    t.start(0);
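    // Keep writing the buffer until roughly fileSize bytes have been written.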
    for (long i = 0; i < numIters; i++) {
        os.write(buf.array());
        buf.flip();
    }
    t.end(0);
    os.close();
    fs.delete(hdfsFilePath, true);

    t.dump();

    return 0;
}

From source file:DumpPageRankRecordsToPlainText.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);

    LOG.info("Tool name: " + DumpPageRankRecordsToPlainText.class.getSimpleName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    Configuration conf = new Configuration();
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    Job job = Job.getInstance(conf);
    job.setJobName(DumpPageRankRecordsToPlainText.class.getSimpleName());
    job.setJarByClass(DumpPageRankRecordsToPlainText.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}