List of usage examples for the org.apache.hadoop.fs.Path constructor
public Path(URI aUri)
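This constructor builds a Path directly from a java.net.URI; the string-based constructor used in most of the examples below parses its argument into a URI internally. A minimal sketch of the URI form (the namenode host, port, and file name are illustrative):

import java.net.URI;
import org.apache.hadoop.fs.Path;

public class PathFromUriExample {
    public static void main(String[] args) {
        // A fully qualified path: scheme, authority, and absolute path.
        Path p = new Path(URI.create("hdfs://namenode:8020/user/data/input.txt"));
        System.out.println(p.toUri().getScheme()); // hdfs
        System.out.println(p.getName());           // input.txt
    }
}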
From source file:BwaInterpreter.java
License:Open Source License
private void combineOutputSamFiles(String outputHdfsDir, List<String> returnedValues) {
    try {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        Path finalHdfsOutputFile = new Path(outputHdfsDir + "/FullOutput.sam");
        FSDataOutputStream outputFinalStream = fs.create(finalHdfsOutputFile, true);

        // Iterate over the resulting files in HDFS and aggregate them into a single file.
        for (int i = 0; i < returnedValues.size(); i++) {
            LOG.info("JMAbuin:: SparkBWA :: Returned file ::" + returnedValues.get(i));
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(fs.open(new Path(returnedValues.get(i)))));

            String line;
            line = br.readLine();

            while (line != null) {
                // Keep the SAM header (@-prefixed lines) only from the first file.
                if (i == 0 || !line.startsWith("@")) {
                    outputFinalStream.write((line + "\n").getBytes());
                }
                line = br.readLine();
            }
            br.close();

            fs.delete(new Path(returnedValues.get(i)), true);
        }

        outputFinalStream.close();
        fs.close();
    } catch (IOException e) {
        e.printStackTrace();
        LOG.error(e.toString());
    }
}
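The output path above is built by string concatenation. Path also offers a two-argument parent/child constructor that handles the separator; a minimal sketch (the directory and file names are illustrative):

import org.apache.hadoop.fs.Path;

public class PathChildExample {
    public static void main(String[] args) {
        // Equivalent to new Path(outputHdfsDir + "/FullOutput.sam"),
        // without hand-written separator handling.
        Path out = new Path("/user/output", "FullOutput.sam");
        System.out.println(out); // /user/output/FullOutput.sam
    }
}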
From source file:BwaInterpreter.java
License:Open Source License
/**
 * Runs BWA with the specified options.
 * @brief This function runs BWA with the input data and the options selected by the user.
 */
public void RunBwa() {
    LOG.info("JMAbuin:: Starting BWA");
    Bwa bwa = new Bwa(this.options);

    List<String> returnedValues;
    if (bwa.isPairedReads()) {
        JavaRDD<Tuple2<String, String>> readsRDD = handlePairedReadsSorting();
        returnedValues = MapPairedBwa(bwa, readsRDD);
    } else {
        JavaRDD<String> readsRDD = handleSingleReadsSorting();
        returnedValues = MapSingleBwa(bwa, readsRDD);
    }

    LOG.info("BwaRDD :: Total of returned lines from RDDs :: " + returnedValues.size());

    // If a reducer is used, the final output has to be stored in just one file.
    if (bwa.isUseReducer()) {
        combineOutputSamFiles(bwa.getOutputHdfsDir(), returnedValues);
    } else {
        for (String outputFile : returnedValues) {
            LOG.info("JMAbuin:: SparkBWA:: Returned file ::" + outputFile);
        }
    }

    // After the execution, if the temporary input file exists, it should be deleted.
    try {
        if ((this.inputTmpFileName != null) && (!this.inputTmpFileName.isEmpty())) {
            FileSystem fs = FileSystem.get(this.conf);
            fs.delete(new Path(this.inputTmpFileName), true);
            fs.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
        LOG.error(e.toString());
    }
}
From source file:BwaInterpreter.java
License:Open Source License
/**
 * Used to perform the sort operation in HDFS.
 * @brief This function provides a method to perform the sort phase in HDFS.
 * @author José M. Abuín
 * @param fileName1 The first file that contains input FASTQ reads, stored in HDFS.
 * @param fileName2 The second file that contains input FASTQ reads, stored in HDFS.
 * @return A JavaRDD that contains the paired reads, sorted.
 */
public JavaRDD<Tuple2<String, String>> SortInHDFS2(String fileName1, String fileName2) {
    Configuration conf = this.conf;
    LOG.info("JMAbuin:: Starting writing reads to HDFS");

    try {
        FileSystem fs = FileSystem.get(conf);
        Path outputFilePath = new Path(this.inputTmpFileName);

        // To write the paired reads
        FSDataOutputStream outputFinalStream = fs.create(outputFilePath, true);

        // To read paired reads from both files
        BufferedReader brFastqFile1 = new BufferedReader(new InputStreamReader(fs.open(new Path(fileName1))));
        BufferedReader brFastqFile2 = new BufferedReader(new InputStreamReader(fs.open(new Path(fileName2))));

        String lineFastq1;
        String lineFastq2;

        lineFastq1 = brFastqFile1.readLine();
        lineFastq2 = brFastqFile2.readLine();

        // Loop to read the two files. Both must have the same number of lines.
        while (lineFastq1 != null) {
            // The lines are written interleaved.
            outputFinalStream.write((lineFastq1 + "\n" + lineFastq2 + "\n").getBytes());

            // Read the next lines.
            lineFastq1 = brFastqFile1.readLine();
            lineFastq2 = brFastqFile2.readLine();
        }

        // Close the input and output files.
        brFastqFile1.close();
        brFastqFile2.close();
        outputFinalStream.close();

        // Now read the file created above and build the RDD from it.
        ContentSummary cSummary = fs.getContentSummary(outputFilePath);
        long length = cSummary.getLength();
        this.totalInputLength = length;
        fs.close();

        // If the user requested a specific number of partitions.
        if (this.options.getPartitionNumber() != 0) {
            // These options set the split size so that the correct number of partitions is obtained.
            this.conf.set("mapreduce.input.fileinputformat.split.maxsize",
                    String.valueOf(length / this.options.getPartitionNumber()));
            this.conf.set("mapreduce.input.fileinputformat.split.minsize",
                    String.valueOf(length / this.options.getPartitionNumber()));

            LOG.info("JMAbuin partitioning from HDFS:: "
                    + String.valueOf(length / this.options.getPartitionNumber()));

            // Using the FastqInputFormatDouble class, read values from the HDFS file into an RDD.
            return this.ctx.newAPIHadoopFile(this.inputTmpFileName, FastqInputFormatDouble.class,
                    Long.class, String.class, this.conf).mapPartitions(new BigFastq2RDDPartitionsDouble(), true);
        } else {
            // Using the FastqInputFormatDouble class, read values from the HDFS file into an RDD.
            return this.ctx.newAPIHadoopFile(this.inputTmpFileName, FastqInputFormatDouble.class,
                    Long.class, String.class, this.conf).map(new BigFastq2RDDDouble());
        }
    } catch (IOException e) {
        e.printStackTrace();
        LOG.error(e.toString());
        return null;
    }
}
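getContentSummary walks an entire subtree; since outputFilePath here is a single file, its length can also be read from the file status. A minimal sketch (the file name is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileLengthExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path p = new Path("/tmp/reads.fastq"); // illustrative path
        // For a single file this matches fs.getContentSummary(p).getLength().
        long length = fs.getFileStatus(p).getLen();
        System.out.println(length);
        fs.close();
    }
}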
From source file:TestStringRelevance.java
License:Apache License
@Override
public void setUp() throws Exception {
    fs.delete(new Path(INPUT), true);
    fs.delete(new Path(QUERY), true);
    fs.delete(new Path(OUTPUT), true);

    inputTap = new Hfs(new SequenceFile(new Fields("str1", "str2")), INPUT);
    TapCollector coll = new TapCollector(inputTap, new JobConf());
    coll.add(tuple1);
    coll.add(tuple2);
    coll.add(tuple3);
    coll.add(tuple4);
    coll.add(tuple5);
    coll.add(tuple6);
    coll.add(tuple7);
    coll.add(tuple8);
    coll.add(tuple9);
    coll.close();

    keyTap = new Hfs(new SequenceFile(new Fields("str")), QUERY);
    coll = new TapCollector(keyTap, new JobConf());
    coll.add(new Tuple(new Text("nathan@rapleaf.com")));
    coll.add(new Tuple(new Text("1@gmail.com")));
    coll.add(new Tuple(new Text("2@gmail.com")));
    coll.add(new Tuple(new Text("6@gmail.com")));
    coll.close();

    outputTap = new Hfs(new SequenceFile(new Fields("str1", "str2")), OUTPUT);
}
From source file:LinkReverser.java
License:Apache License
/**
 * The main driver for the link-reverser map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), LinkReverser.class);
    conf.setJobName("indexreverser");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                conf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                conf.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else {
                other_args.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }

    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
}
From source file:WikipediaDocnoMappingBuilder.java
License:Apache License
@SuppressWarnings("static-access") @Override/*from www .j av a 2 s .c om*/ public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output file") .create(OUTPUT_FILE_OPTION)); options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg() .withDescription("two-letter language code").create(LANGUAGE_OPTION)); options.addOption(KEEP_ALL_OPTION, false, "keep all pages"); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_FILE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (language.length() != 2) { System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } } String inputPath = cmdline.getOptionValue(INPUT_OPTION); String outputFile = cmdline.getOptionValue(OUTPUT_FILE_OPTION); boolean keepAll = cmdline.hasOption(KEEP_ALL_OPTION); String tmpPath = "tmp-" + WikipediaDocnoMappingBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - input: " + inputPath); LOG.info(" - output file: " + outputFile); LOG.info(" - keep all pages: " + keepAll); LOG.info(" - language: " + language); // Job job = Job.getInstance(getConf()); JobConf conf = new JobConf(WikipediaDocnoMappingBuilder.class); conf.setJarByClass(WikipediaDocnoMappingBuilder.class); conf.setJobName(String.format("BuildWikipediaDocnoMapping[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, OUTPUT_FILE_OPTION, outputFile, LANGUAGE_OPTION, language)); conf.setBoolean(KEEP_ALL_OPTION, keepAll); // .getConfiguration().setBoolean(KEEP_ALL_OPTION, keepAll); if (language != null) { conf.set("wiki.language", language); } conf.setNumReduceTasks(1); FileInputFormat.addInputPath(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(tmpPath)); FileOutputFormat.setCompressOutput(conf, false); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(IntWritable.class); conf.setInputFormat(WikipediaPageInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(MyReducer.class); // Delete the output directory if it exists already. FileSystem.get(getConf()).delete(new Path(tmpPath), true); // job.waitForCompletion(true); RunningJob job = JobClient.runJob(conf); job.waitForCompletion(); // JobClient jobClient = new JobClient(conf); long cnt = keepAll ? job.getCounters().findCounter(PageTypes.TOTAL).getValue() : job.getCounters().findCounter(PageTypes.ARTICLE).getValue(); WikipediaDocnoMapping.writeDocnoMappingData(FileSystem.get(getConf()), tmpPath + "/part-00000", (int) cnt, outputFile); FileSystem.get(getConf()).delete(new Path(tmpPath), true); return 0; }
From source file:DescSorter.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: flights <in> [<in>...] <out>");
        System.exit(2);
    }

    Job job = new Job(conf, "AvgDelays");
    job.setJarByClass(DescSorter.class);
    job.setMapperClass(FlightMapper.class);

    job.setMapOutputKeyClass(CompositeKey.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setPartitionerClass(CompositeKeyPartitioner.class);
    job.setSortComparatorClass(SortComparator.class);
    job.setGroupingComparatorClass(GroupingComparator.class);

    job.setReducerClass(AvgDelayReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    // All arguments but the last are input paths; the last one is the output path.
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
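When the input paths are known up front, the new-API FileInputFormat can also take them all in one varargs call instead of a loop of addInputPath. A minimal sketch (the paths are illustrative):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class SetInputPathsExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        // Equivalent to calling addInputPath once per path.
        FileInputFormat.setInputPaths(job, new Path("/data/2007"), new Path("/data/2008"));
    }
}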
From source file:CalculateHistogram.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: CalculateHistogram <in> <out>");
        System.exit(2);
    }

    Job job = new Job(conf, "MRDT - Generate Histogram");
    job.setJarByClass(CalculateHistogram.class);
    job.setMapperClass(HistogramMap.class);
    job.setReducerClass(HistogramReduce.class);

    //job.setOutputValueClass(HistogramBucket.class);
    //job.setMapOutputKeyClass(LongWritable.class);
    //job.setMapOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:SingleFileWriter.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("SingleFileWriter [fileSize ie. 1g/10g/100g]");
        return 1;
    }

    double fileSize = Double.parseDouble((args[0].split("g|G"))[0]) * 1024 * 1024 * 1024;

    String hdfsFolder = "/hdfs_test/";
    String hdfsFile = hdfsFolder + args[0];
    short replication = 1;
    boolean overWrite = true;
    int bufferSize = 65536;
    int blockSize = 536870912;
    double numIters = fileSize / (double) bufferSize;

    /* Initialize the byte buffer. Integer.SIZE is in bits, so divide by 8
       to fill the entire buffer with ints (each int occupies 4 bytes). */
    ByteBuffer buf = ByteBuffer.allocate(bufferSize);
    buf.order(ByteOrder.nativeOrder());
    for (int k = 0; k < bufferSize / (Integer.SIZE / 8); k++) {
        buf.putInt(k);
    }
    buf.flip();

    /* Create the file on HDFS */
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    Path hdfsFilePath = new Path(hdfsFile);
    OutputStream os = fs.create(hdfsFilePath, overWrite, bufferSize, replication, blockSize);

    /* Write the content of the byte buffer to the HDFS file */
    Timer t = new Timer();
    t.start(0);
    for (long i = 0; i < numIters; i++) {
        os.write(buf.array());
        buf.flip();
    }
    t.end(0);
    os.close();

    fs.delete(hdfsFilePath, true);

    t.dump();
    return 0;
}
From source file:DumpPageRankRecordsToPlainText.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);

    LOG.info("Tool name: " + DumpPageRankRecordsToPlainText.class.getSimpleName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    Configuration conf = new Configuration();
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    Job job = Job.getInstance(conf);
    job.setJobName(DumpPageRankRecordsToPlainText.class.getSimpleName());
    job.setJarByClass(DumpPageRankRecordsToPlainText.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}
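The drivers above delete the old output directory through FileSystem.get(conf), which resolves the default filesystem. When a Path carries its own scheme and authority (e.g. hdfs:// or file://), Path.getFileSystem(conf) resolves the filesystem that actually owns that path. A minimal sketch (the namenode URI is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DeleteOutputExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path output = new Path("hdfs://namenode:8020/user/out"); // illustrative URI
        // Resolve the filesystem from the path's own scheme and authority.
        FileSystem fs = output.getFileSystem(conf);
        fs.delete(output, true); // recursive delete, as in the drivers above
    }
}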