List of usage examples for org.apache.hadoop.fs.FileSystem.open
public FSDataInputStream open(Path f) throws IOException
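Before the sourced examples below, a minimal self-contained sketch of the common pattern: obtain a FileSystem from a Configuration, call open() on a Path, and read through the returned FSDataInputStream. The class name and path used here are illustrative placeholders, not taken from any of the projects listed below.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OpenExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/tmp/example.txt"); // hypothetical path, adjust as needed

        // open() returns an FSDataInputStream, which is seekable and supports positioned reads
        try (FSDataInputStream in = fs.open(path);
             BufferedReader reader = new BufferedReader(
                     new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}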
From source file:HdfsCacheReader.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("HdfsReader [FileSize i.e. 1g/10g/100g/200g]");
        return 1;
    }
    double fileSize;
    double fileSizeInMB;
    if (args[0].equals("1g")) {
        fileSize = 1073741824.0;
        fileSizeInMB = 1024.0;
    } else if (args[0].equals("10g")) {
        fileSize = 10737418240.0;
        fileSizeInMB = 10240.0;
    } else if (args[0].equals("100g")) {
        fileSize = 107374182400.0;
        fileSizeInMB = 102400.0;
    } else if (args[0].equals("200g")) {
        fileSize = 214748364800.0;
        fileSizeInMB = 204800.0;
    } else {
        throw new IllegalArgumentException("Invalid arg: " + args[0]);
    }
    String fileName = "cacheRead-" + args[0] + "-avg.txt";
    File avgFile = new File(fileName);
    PrintWriter avgPW = new PrintWriter(avgFile);
    fileName = "cacheRead-" + args[0] + "-min.txt";
    File minFile = new File(fileName);
    PrintWriter minPW = new PrintWriter(minFile);
    fileName = "cacheRead-" + args[0] + "-max.txt";
    File maxFile = new File(fileName);
    PrintWriter maxPW = new PrintWriter(maxFile);

    int numIters = 10;
    int bufferSize = 65536;
    long blockSize[] = new long[] { 67108864, 134217728, 268435456, 536870912, 1073741824 };
    short replication[] = new short[] { 1, 4 };
    String hdfsFile = "/hdfs_test/" + args[0] + "/1.in";
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    Path hdfsFilePath = new Path(hdfsFile);

    for (int i = 0; i < 5; i++) { // blockSize
        for (int j = 0; j < 2; j++) { // replication
            OutputStream os = fs.create(hdfsFilePath, true, bufferSize, replication[j], blockSize[i]);
            byte[] buf = new byte[bufferSize];
            for (int m = 0; m < bufferSize; m += 4) {
                buf[m] = (byte) m;
            }
            double numBufPerFile = fileSize / (double) bufferSize;
            for (double m = 0.0; m < numBufPerFile; m++) {
                os.write(buf);
            }
            os.close();

            String cmdStr = "/usr/local/hadoop/bin/hdfs cacheadmin -addDirective -path " + hdfsFile
                    + " -pool hdfs_test";
            Process p = Runtime.getRuntime().exec(cmdStr);
            p.waitFor();
            String cmdOutLine = "";
            StringBuffer cmdOut = new StringBuffer();
            BufferedReader cmdOutReader = new BufferedReader(new InputStreamReader(p.getInputStream()));
            while ((cmdOutLine = cmdOutReader.readLine()) != null) {
                cmdOut.append(cmdOutLine + "\n");
            }
            // System.out.println(cmdOut.toString());

            long avg = 0, min = Long.MAX_VALUE, max = Long.MIN_VALUE;
            for (int k = 0; k < numIters; k++) {
                FSDataInputStream in = fs.open(hdfsFilePath);
                ByteBuffer bbuf = null;
                ElasticByteBufferPool ebbp = new ElasticByteBufferPool();
                long startTime = System.currentTimeMillis();
                while ((bbuf = in.read(ebbp, bufferSize, EnumSet.of(ReadOption.SKIP_CHECKSUMS))) != null) {
                    in.releaseBuffer(bbuf);
                }
                long endTime = System.currentTimeMillis();
                in.close();
                long duration = (endTime - startTime);
                avg += duration;
                if (duration < min) {
                    min = duration;
                }
                if (duration > max) {
                    max = duration;
                }
            }

            // write result to output
            double avgBW = fileSizeInMB * 1000.0 * (double) numIters / (double) avg;
            avgPW.print(avgBW);
            avgPW.print("\t");
            double minBW = fileSizeInMB * 1000.0 / (double) max;
            minPW.print(minBW);
            minPW.print("\t");
            double maxBW = fileSizeInMB * 1000.0 / (double) min;
            maxPW.print(maxBW);
            maxPW.print("\t");

            cmdStr = "/usr/local/hadoop/bin/hdfs cacheadmin -removeDirectives -path " + hdfsFile;
            p = Runtime.getRuntime().exec(cmdStr);
            p.waitFor();
            cmdOutLine = "";
            cmdOut.setLength(0);
            cmdOutReader = new BufferedReader(new InputStreamReader(p.getInputStream()));
            while ((cmdOutLine = cmdOutReader.readLine()) != null) {
                cmdOut.append(cmdOutLine + "\n");
            }
            // System.out.println(cmdOut.toString());

            fs.delete(hdfsFilePath, true);
        }
        avgPW.println();
        minPW.println();
        maxPW.println();
    }
    avgPW.close();
    minPW.close();
    maxPW.close();
    return 0;
}
From source file:RunPageRankBasic.java
License:Apache License
private float phase1(int i, int j, String basePath, int numNodes, boolean useCombiner,
        boolean useInMapperCombiner) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName("PageRank:Basic:iteration" + j + ":Phase1");
    job.setJarByClass(RunPageRankBasic.class);

    String in = basePath + "/iter" + formatter.format(i);
    String out = basePath + "/iter" + formatter.format(j) + "t";
    String outm = out + "-mass";

    // We need to actually count the number of part files to get the number of partitions (because
    // the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(getConf()).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;
    }

    LOG.info("PageRank: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + numNodes);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info(" - useInmapCombiner: " + useInMapperCombiner);
    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    job.getConfiguration().setInt("NodeCount", numNodes);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    //job.getConfiguration().set("mapred.child.java.opts", "-Xmx2048m");
    job.getConfiguration().set("PageRankMassPath", outm);

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(useInMapperCombiner ? MapWithInMapperCombiningClass.class : MapClass.class);

    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }

    job.setReducerClass(ReduceClass.class);

    FileSystem.get(getConf()).delete(new Path(out), true);
    FileSystem.get(getConf()).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    float mass = Float.NEGATIVE_INFINITY;
    FileSystem fs = FileSystem.get(getConf());
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}
From source file:DijikstraAlgo.java
License:GNU General Public License
public static void run(String[] args) throws Exception {
    IN = "hdfs://10.8.3.161:9000/user/sagar/input/";
    OUT = "hdfs://10.8.3.161:9000/user/sagar/output/";
    String input = IN;
    String output = OUT + System.nanoTime();
    String MAX_SPLIT_SIZE = args[0];
    boolean isdone = false;

    // Reiteration again and again till the convergence
    while (isdone == false) {
        JobConf conf = new JobConf(DijikstraAlgo.class);
        conf.setJobName("Dijikstra");
        // conf.set("mapred.max.split.size", MAX_SPLIT_SIZE);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapperClass(Map.class);
        conf.setReducerClass(Reduce.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(input));
        FileOutputFormat.setOutputPath(conf, new Path(output));

        JobClient.runJob(conf);

        input = output + "/part-00000";
        isdone = true; // set the job to NOT run again!
        Path ofile = new Path(input);
        FileSystem fs = FileSystem.get(new URI("hdfs://10.8.3.165:9000"), conf);
        //FileSystem fs = FileSystem.get(new Configuration());
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(ofile)));

        HashMap<Integer, Integer> imap = new HashMap<Integer, Integer>();
        String line = br.readLine();
        // Read the current output file and put it into HashMap
        while (line != null) {
            String[] sp = line.split("\t| ");
            int node = Integer.parseInt(sp[0]);
            int distance = Integer.parseInt(sp[1]);
            imap.put(node, distance);
            line = br.readLine();
        }
        br.close();

        // Check for convergence condition: if any node is still left then
        // continue, else stop
        Iterator<Integer> itr = imap.keySet().iterator();
        while (itr.hasNext()) {
            int key = itr.next();
            int value = imap.get(key);
            if (value >= 125) {
                isdone = false;
            }
        }
        input = output;
        output = OUT + System.nanoTime();
    }
}
From source file:HDFSRandomAccessFile.java
License:Apache License
public HDFSRandomAccessFile(String fileSystemURI, String location, int bufferSize) throws IOException {
    super(bufferSize);
    fsURI = URI.create(fileSystemURI);
    filePath = new Path(location);
    this.location = location;
    if (debugLeaks) {
        openFiles.add(location);
    }

    FileSystem fs = FileSystem.get(fsURI, new Configuration());
    hfile = fs.open(filePath);
    fileStatus = fs.getFileStatus(filePath);
}
From source file:LookupPostingsCompressed1.java
License:Apache License
@SuppressWarnings({ "static-access" })
public static void main(String[] args) throws IOException {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INDEX));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(COLLECTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(INDEX) || !cmdline.hasOption(COLLECTION)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(LookupPostingsCompressed1.class.getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.exit(-1);
    }

    String indexPath = cmdline.getOptionValue(INDEX);
    String collectionPath = cmdline.getOptionValue(COLLECTION);

    if (collectionPath.endsWith(".gz")) {
        System.out.println("gzipped collection is not seekable: use compressed version!");
        System.exit(-1);
    }

    Configuration config = new Configuration();
    FileSystem fs = FileSystem.get(config);
    MapFile.Reader reader = new MapFile.Reader(new Path(indexPath + "/part-r-00000"), config);

    FSDataInputStream collection = fs.open(new Path(collectionPath));
    BufferedReader d = new BufferedReader(new InputStreamReader(collection));

    Text key = new Text();
    PairOfWritables<VIntWritable, ArrayListWritable<PairOfVInts>> value =
            new PairOfWritables<VIntWritable, ArrayListWritable<PairOfVInts>>();

    System.out.println("Looking up postings for the term \"starcross'd\"");
    key.set("starcross'd");

    reader.get(key, value);

    ArrayListWritable<PairOfVInts> postings = value.getRightElement();
    for (PairOfVInts pair : postings) {
        System.out.println(pair);
        collection.seek(pair.getLeftElement());
        System.out.println(d.readLine());
    }

    key.set("gold");
    reader.get(key, value);
    System.out.println("Complete postings list for 'gold': " + value);

    Int2IntFrequencyDistribution goldHist = new Int2IntFrequencyDistributionEntry();
    postings = value.getRightElement();
    for (PairOfVInts pair : postings) {
        goldHist.increment(pair.getRightElement());
    }

    System.out.println("histogram of tf values for gold");
    for (PairOfInts pair : goldHist) {
        System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement());
    }

    key.set("silver");
    reader.get(key, value);
    System.out.println("Complete postings list for 'silver': " + value);

    Int2IntFrequencyDistribution silverHist = new Int2IntFrequencyDistributionEntry();
    postings = value.getRightElement();
    for (PairOfVInts pair : postings) {
        silverHist.increment(pair.getRightElement());
    }

    System.out.println("histogram of tf values for silver");
    for (PairOfInts pair : silverHist) {
        System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement());
    }

    key.set("bronze");
    Writable w = reader.get(key, value);

    if (w == null) {
        System.out.println("the term bronze does not appear in the collection");
    }

    collection.close();
    reader.close();
}
From source file:HiveKeyIgnoringBAMOutputFormat.java
License:Open Source License
private void setSAMHeaderFrom(JobConf job) throws IOException {
    if (wrappedOutputFormat.getSAMHeader() != null)
        return;

    // XXX: We're not told where to take the SAM header from so we just merge
    // them all. There should probably be a better way of doing this.
    final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();

    // The "best" sort order among the headers: unsorted if they're sorted
    // differently, otherwise their common sort order.
    SAMFileHeader.SortOrder sortOrder = null;

    // XXX: it seems that FileInputFormat.getInputPaths(job) will point to
    // the directories of the input tables in the query. I'm not sure if this
    // is always the case.
    for (final Path table : FileInputFormat.getInputPaths(job)) {
        final FileSystem fs = table.getFileSystem(job);
        for (final FileStatus stat : fs.listStatus(table)) {
            if (!stat.isFile())
                throw new IOException("Unexpected directory '" + stat.getPath() + "', expected only files");

            final SAMFileReader r = new SAMFileReader(fs.open(stat.getPath()));
            final SAMFileHeader h = r.getFileHeader();
            r.close();
            headers.add(h);

            if (sortOrder == null) {
                sortOrder = h.getSortOrder();
                continue;
            }
            if (sortOrder == SAMFileHeader.SortOrder.unsorted)
                continue;
            if (sortOrder != h.getSortOrder())
                sortOrder = SAMFileHeader.SortOrder.unsorted;
        }
    }

    wrappedOutputFormat.setSAMHeader(new SamFileHeaderMerger(sortOrder, headers, true).getMergedHeader());
}
From source file:BytesBloomFilter.java
License:Apache License
public static BytesBloomFilter readFromFileSystem(FileSystem fs, Path p) throws IOException {
    BytesBloomFilter ret = new BytesBloomFilter();
    FSDataInputStream is = fs.open(p);
    ret.readFields(is);
    is.close();
    return ret;
}
From source file:TestDistinct.java
License:Apache License
public void testDistinct() throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    fs.delete(new Path("/tmp/test_distinct_file"), true);
    fs.delete(new Path("/tmp/test_distinct_file_results"), true);

    FSDataOutputStream out = fs.create(new Path("/tmp/test_distinct_file"));
    PrintWriter pw = new PrintWriter(out);
    pw.println("distinct1");
    pw.println("distinct2");
    pw.println("distinct2");
    pw.println("distinct3");
    pw.println("distinct2");
    pw.flush();
    out.close();

    Map<String, Tap> sources = new HashMap<String, Tap>();
    Map<String, Tap> sinks = new HashMap<String, Tap>();

    Tap inTap = new Hfs(new TextLine(new Fields("line")), "/tmp/test_distinct_file");
    Pipe inPipe = new Pipe("inPipe");
    sources.put("inPipe", inTap);

    Distinct distinct = new Distinct(inPipe);

    Tap outTap = new Hfs(new TextLine(new Fields("line")), "/tmp/test_distinct_file_results");
    Pipe outPipe = new Pipe("outPipe", distinct);
    sinks.put("outPipe", outTap);

    Flow flow = new FlowConnector().connect(sources, sinks, inPipe, outPipe);
    flow.complete();

    FSDataInputStream in = fs.open(new Path("/tmp/test_distinct_file_results/part-00000"));
    BufferedReader reader = new BufferedReader(new InputStreamReader(in));

    ArrayList<String> results = new ArrayList<String>();
    results.add("distinct1");
    results.add("distinct2");
    results.add("distinct3");

    try {
        while (true) {
            String s = reader.readLine();
            if (s == null) {
                break;
            }
            assertEquals(results.remove(0), s);
        }
    } catch (Exception e) {
        fail("Got an exception while trying to verify the results: " + e.toString());
    }
    assertEquals("All results must be consumed!", 0, results.size());
}
From source file:ReadAllTest.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.out.println("ReadAllTest: must supply the HDFS uri and file to read");
        System.exit(1);
    }
    String hdfsUri = args[0];
    String fileName = args[1];

    final Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(new URI(hdfsUri), conf);

    byte ORIGINAL[] = new byte[10];
    for (int i = 0; i < ORIGINAL.length; i++) {
        ORIGINAL[i] = (byte) i;
    }

    FSDataOutputStream out = fs.create(new Path(fileName), (short) 1);
    try {
        out.write(ORIGINAL);
    } finally {
        out.close();
    }

    byte input[] = new byte[ORIGINAL.length];
    FSDataInputStream in = fs.open(new Path(fileName));
    try {
        in.readFully(input);
    } finally {
        in.close();
    }

    in = fs.open(new Path(fileName));
    try {
        in.readFully(0, input);
    } finally {
        in.close();
    }
}
From source file:LookupPostings.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INDEX));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(COLLECTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(INDEX) || !cmdline.hasOption(COLLECTION)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(LookupPostings.class.getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.exit(-1);
    }

    String indexPath = cmdline.getOptionValue(INDEX);
    String collectionPath = cmdline.getOptionValue(COLLECTION);

    if (collectionPath.endsWith(".gz")) {
        System.out.println("gzipped collection is not seekable: use compressed version!");
        System.exit(-1);
    }

    Configuration config = new Configuration();
    FileSystem fs = FileSystem.get(config);
    MapFile.Reader reader = new MapFile.Reader(new Path(indexPath + "/part-r-00000"), config);

    FSDataInputStream collection = fs.open(new Path(collectionPath));
    BufferedReader d = new BufferedReader(new InputStreamReader(collection));

    Text key = new Text();
    PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>> value =
            new PairOfWritables<IntWritable, ArrayListWritable<PairOfInts>>();

    System.out.println("Looking up postings for the term \"starcross'd\"");
    key.set("starcross'd");

    reader.get(key, value);

    ArrayListWritable<PairOfInts> postings = value.getRightElement();
    for (PairOfInts pair : postings) {
        System.out.println(pair);
        collection.seek(pair.getLeftElement());
        System.out.println(d.readLine());
    }

    key.set("gold");
    reader.get(key, value);
    System.out.println("Complete postings list for 'gold': " + value);

    Int2IntFrequencyDistribution goldHist = new Int2IntFrequencyDistributionEntry();
    postings = value.getRightElement();
    for (PairOfInts pair : postings) {
        goldHist.increment(pair.getRightElement());
    }

    System.out.println("histogram of tf values for gold");
    for (PairOfInts pair : goldHist) {
        System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement());
    }

    key.set("silver");
    reader.get(key, value);
    System.out.println("Complete postings list for 'silver': " + value);

    Int2IntFrequencyDistribution silverHist = new Int2IntFrequencyDistributionEntry();
    postings = value.getRightElement();
    for (PairOfInts pair : postings) {
        silverHist.increment(pair.getRightElement());
    }

    System.out.println("histogram of tf values for silver");
    for (PairOfInts pair : silverHist) {
        System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement());
    }

    key.set("bronze");
    Writable w = reader.get(key, value);

    if (w == null) {
        System.out.println("the term bronze does not appear in the collection");
    }

    collection.close();
    reader.close();

    return 0;
}