List of usage examples for the org.apache.hadoop.fs.Path constructor
public Path(URI aUri)
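Note that the examples below all happen to reach Path through its String overload; for the URI overload named above, here is a minimal sketch. The namenode address and file path are hypothetical, chosen only for illustration:

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PathFromUriExample {
    public static void main(String[] args) throws Exception {
        // A fully qualified URI: the scheme and authority select the file
        // system, the path component selects the file. Host and path here
        // are hypothetical.
        URI uri = URI.create("hdfs://namenode.example.com:9000/user/data/input.txt");
        Path p = new Path(uri);

        // The Path carries its scheme and authority; getFileSystem resolves
        // them to the matching FileSystem implementation.
        FileSystem fs = p.getFileSystem(new Configuration());
        System.out.println(p + " exists: " + fs.exists(p));
    }
}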
From source file:FormatStorageBasicTest.java
License:Open Source License
public void testOpenNoRecord() {
    try {
        String fileName = prefix + "testOpenNoRecord";

        Head head = new Head();
        FormatDataFile fd = new FormatDataFile(new Configuration());
        fd.create(fileName, head);
        fd.close();

        FileSystem fs = FileSystem.get(new Configuration());
        long fileLen = fs.getFileStatus(new Path(fileName)).getLen();
        if (fileLen != head.len() + ConstVar.IndexMetaOffset) {
            fail("error file len:" + fileLen);
        }

        FormatDataFile fd2 = new FormatDataFile(new Configuration());
        fd2.open(fileName);

        if (fd2.recordNum() != 0) {
            fail("error record num:" + fd2.recordNum());
        }
        if (fd2.segmentNum() != 0) {
            fail("error segment num:" + fd2.segmentNum());
        }
    } catch (Exception e) {
        e.printStackTrace();
        fail("get exception:" + e.getMessage());
    }
}
From source file:WikipediaForwardIndexBuilder.java
License:Apache License
@SuppressWarnings("static-access") @Override/*w ww. ja va 2 s .c o m*/ public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION)); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION)); options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg() .withDescription("two-letter language code").create(LANGUAGE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION)); String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION); String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000); if (!inputPath.isAbsolute()) { System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!"); return -1; } String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (language.length() != 2) { System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } } JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class); FileSystem fs = FileSystem.get(conf); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - input path: " + inputPath); LOG.info(" - index file: " + indexFile); LOG.info(" - language: " + language); LOG.info("Note: This tool only works on block-compressed SequenceFiles!"); conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language)); conf.setNumReduceTasks(1); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, new Path(tmpPath)); FileOutputFormat.setCompressOutput(conf, false); if (language != null) { conf.set("wiki.language", language); } conf.setInputFormat(NoSplitSequenceFileInputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(Text.class); conf.setMapRunnerClass(MyMapRunner.class); conf.setReducerClass(IdentityReducer.class); // Delete the output directory if it exists already. 
fs.delete(new Path(tmpPath), true); RunningJob job = JobClient.runJob(conf); Counters counters = job.getCounters(); int blocks = (int) counters.getCounter(Blocks.Total); LOG.info("number of blocks: " + blocks); LOG.info("Writing index file..."); LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000"))); FSDataOutputStream out = fs.create(new Path(indexFile), true); out.writeUTF(edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex.class.getCanonicalName()); out.writeUTF(inputPath.toString()); out.writeInt(blocks); int cnt = 0; Text line = new Text(); while (reader.readLine(line) > 0) { String[] arr = line.toString().split("\\s+"); int docno = Integer.parseInt(arr[0]); int offset = Integer.parseInt(arr[1]); short fileno = Short.parseShort(arr[2]); out.writeInt(docno); out.writeInt(offset); out.writeShort(fileno); cnt++; if (cnt % 100000 == 0) { LOG.info(cnt + " blocks written"); } } reader.close(); out.close(); if (cnt != blocks) { throw new RuntimeException("Error: mismatch in block count!"); } // Clean up. fs.delete(new Path(tmpPath), true); return 0; }
From source file:AvgScore.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: AvgScore <in> [<in>...] <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "AvgScore");
    job.setJarByClass(AvgScore.class);
    job.setMapperClass(Map.class);
    //job.setCombinerClass(Reduce.class);
    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:LobFilePerfTest.java
License:Apache License
public LobFilePerfTest() {
    conf = new Configuration();
    conf.set("fs.default.name", "file:///");
    p = new Path("foo.lob");
}
From source file:HBaseBloomFilterSemiJoinSystemTest.java
License:Apache License
@Test
public void testBloomFilterSemiJoinDirectly()
        throws IOException, InterruptedException, NoSuchFieldException, IllegalAccessException {
    NavigableMap<ByteBuffer, ListMultimap<ByteBuffer, BloomFilter>> regionIndex =
            NonAggregatingRegionObserver.buildIndex("test_table", util.getConfiguration(),
                    util.getTestFileSystem(),
                    new Path(util.getDefaultRootDirPath() + Path.SEPARATOR + "test_table"));

    assertSame("Unexpected number of regions.", 3, regionIndex.size());

    NavigableMap<HRegionInfo, ServerName> regions = table.getRegionLocations();
    for (Map.Entry<HRegionInfo, ServerName> entry : regions.entrySet()) {
        LOG.info("Using Region: " + entry.getKey() + " Server: " + entry.getValue());
    }

    for (Map.Entry<ByteBuffer, ListMultimap<ByteBuffer, BloomFilter>> entry : regionIndex.entrySet()) {
        assertSame("Unexpected number of HFiles.", 1,
                entry.getValue().get(ByteBuffer.wrap(ROW_COLBF_CF)).size());
    }

    CompoundBloomFilterBase bfEntryCreator = new CompoundBloomFilterBase();

    double falsePositivesCounter = 0.0;
    for (int i = 0; i < NUM_ROWS; i++) {
        byte[] key1 = toBytes("aaa" + i);
        byte[] key2 = toBytes("bbb" + i);
        byte[] key3 = toBytes("ccc" + i);

        assertNotNull("Could not find a region for key: " + new String(key2));

        // Creates bbbXaaaX BF entry keys that must match region0 [bbb0, bbb999]
        // ROW_COL BF [bbb0aaa0, bbb999aaa999].
        byte[] bfMatchKey = bfEntryCreator.createBloomKey(key2, 0, key2.length, key1, 0, key1.length);
        BloomFilter bfMatch = bloomFilterForRowCol(regionIndex, key2);

        // Bloom filters never return false negatives, so an inserted key must match.
        assertTrue("Unexpected result from the bloom filter: " + new String(bfMatchKey),
                bfMatch.contains(bfMatchKey, 0, bfMatchKey.length, null));

        // Creates bbbXcccX BF entry keys that don't exist in region2 [bbb0, bbb999]
        // ROW_COL BF [bbb0aaa0, bbb999aaa999] but will match the index, and
        // therefore may produce false positives.
        byte[] bfNoMatchKeyFalsePositives =
                bfEntryCreator.createBloomKey(key2, 0, key2.length, key3, 0, key3.length);
        BloomFilter bfNoMatchFalsePositives = bloomFilterForRowCol(regionIndex, key2);
        if (bfNoMatchFalsePositives.contains(bfNoMatchKeyFalsePositives, 0,
                bfNoMatchKeyFalsePositives.length, null)) {
            falsePositivesCounter++;
        }

        if (i <= NUM_ROWS / 2 && Integer.parseInt((i + "").charAt(0) + "") < 5) {
            // Creates cccXaaaX BF entry keys that don't exist in region3 [ccc0, ccc999]
            // ROW_COL BF [ccc0bbb0, ccc999bbb999] and won't match the index
            // (and therefore won't produce false positives).
            byte[] bfNoMatchKeyNoFalsePositives =
                    bfEntryCreator.createBloomKey(key3, 0, key3.length, key1, 0, key1.length);
            BloomFilter bfNoMatchNoFalsePositives = bloomFilterForRowCol(regionIndex, key3);
            assertFalse("Unexpected result from the bloom filter: "
                    + new String(bfNoMatchKeyNoFalsePositives),
                    bfNoMatchNoFalsePositives.contains(bfNoMatchKeyNoFalsePositives, 0,
                            bfNoMatchKeyNoFalsePositives.length, null));
        }
    }

    double falsePositiveRate = falsePositivesCounter / NUM_ROWS;
    LOG.info("False positive rate: {}", falsePositiveRate);
    assertTrue("Unexpectedly high percentage of false positives: " + falsePositiveRate,
            falsePositiveRate < 0.1);
}
From source file:RunPageRankSchimmy.java
License:Apache License
private float phase1(String path, int i, int j, int n, boolean useCombiner,
        boolean useInmapCombiner, boolean useRange) throws Exception {
    Configuration conf = getConf();

    String in = path + "/iter" + FORMAT.format(i);
    String out = path + "/iter" + FORMAT.format(j) + "t";
    String outm = out + "-mass";

    FileSystem fs = FileSystem.get(conf);

    // We need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-")) {
            numPartitions++;
        }
    }

    conf.setInt("NodeCount", n);

    Partitioner<IntWritable, Writable> p = null;
    if (useRange) {
        p = new RangePartitioner();
        ((Configurable) p).setConf(conf);
    } else {
        p = new HashPartitioner<IntWritable, Writable>();
    }

    // This is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so this means that we need to
    // open up each partition, peek inside to find out.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();
    FileStatus[] status = fs.listStatus(new Path(in));

    StringBuilder sb = new StringBuilder();
    for (FileStatus f : status) {
        if (!f.getPath().getName().contains("part-")) {
            continue;
        }

        SequenceFile.Reader reader = new SequenceFile.Reader(conf,
                SequenceFile.Reader.file(f.getPath()));

        reader.next(key, value);
        int np = p.getPartition(key, value, numPartitions);
        reader.close();

        LOG.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + ";");
    }

    LOG.info(sb.toString().trim());

    LOG.info("PageRankSchimmy: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + n);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info(" - useInmapCombiner: " + useInmapCombiner);
    LOG.info(" - numPartitions: " + numPartitions);
    LOG.info(" - useRange: " + useRange);
    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    //conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", sb.toString().trim());

    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    Job job = Job.getInstance(conf);
    job.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");
    job.setJarByClass(RunPageRankSchimmy.class);

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(FloatWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        job.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        job.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        job.setPartitionerClass(RangePartitioner.class);
    }

    job.setReducerClass(ReduceClass.class);

    FileSystem.get(conf).delete(new Path(out), true);
    FileSystem.get(conf).delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in "
            + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}
From source file:RunPageRankSchimmy.java
License:Apache License
private void phase2(String path, int i, int j, int n, float missing) throws Exception {
    Configuration conf = getConf();

    LOG.info("missing PageRank mass: " + missing);
    LOG.info("number of nodes: " + n);

    String in = path + "/iter" + FORMAT.format(j) + "t";
    String out = path + "/iter" + FORMAT.format(j);

    LOG.info("PageRankSchimmy: iteration " + j + ": Phase2");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);

    Job job = Job.getInstance(conf);
    job.setJobName("PageRankSchimmy:iteration" + j + ":Phase2");
    job.setJarByClass(RunPageRankSchimmy.class);

    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(MapPageRankMassDistributionClass.class);

    conf.setFloat("MissingMass", (float) missing);
    conf.setInt("NodeCount", n);

    FileSystem.get(conf).delete(new Path(out), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in "
            + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
}
From source file:TaskSearchWords.java
public static void main(String[] args) throws Exception {

    String hadoopServer = "ip-172-31-13-245.ap-southeast-1.compute.internal";

    Configuration conf = new Configuration();

    // The job tracker address, as defined in mapred-site.xml.
    conf.set("mapred.job.tracker", hadoopServer + ":54311");
    // The default file system, as defined in hdfs-site.xml.
    conf.set("fs.default.name", "hdfs://" + hadoopServer + ":9000");

    // Set the mapred classes so the cluster knows which classes to run.
    conf.set("mapreduce.map.class", "TokenizerMapper");
    conf.set("mapreduce.reduce.class", "IntSumReducer");

    // Ship the job jar explicitly to prevent a NoClassDefFoundError.
    conf.set("mapred.jar", "C:\\GitRepos\\OCR\\HadoopTasks\\dist\\HadoopTasks.jar");

    // Pass parameters to the mapred classes.
    conf.set("RAWOCRCLOB",
            "Omeprazole_Cap E/C 10mg\n" + "Dressit Ster esDress\n" + "Flaminal Forte 15g\n"
                    + "Co-Magaldrox_Susp 195mg/220mg/5ml S/F\n"
                    + "Antacid/Oxetacaine_Oral Susp S/F\n" + "Simeticone_Susp 40mg/ml S/F\n"
                    + "Infacol_Susp 40mg/ml S/F");

    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(TaskSearchWords.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path("/user/ubuntu/MedicinesProcessed.csv"));

    FileSystem fs = FileSystem.get(conf);
    Path out = new Path("/user/ubuntu/processed/");
    // Delete any previous output, then set the (now empty) output path.
    fs.delete(out, true);
    FileOutputFormat.setOutputPath(job, out);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
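A note on the fs.delete(out, true) call above: FileOutputFormat refuses to start a job whose output directory already exists, so deleting any previous output first is what keeps repeated runs from failing with an "output directory already exists" error. The same pattern appears in several of the other examples on this page.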
From source file:PartitionGraph.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(new Option(RANGE, "use range partitioner"));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
            .withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
            .withDescription("number of nodes").create(NUM_NODES));
    options.addOption(OptionBuilder.withArgName("num").hasArg()
            .withDescription("number of partitions").create(NUM_PARTITIONS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)
            || !cmdline.hasOption(NUM_NODES) || !cmdline.hasOption(NUM_PARTITIONS)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inPath = cmdline.getOptionValue(INPUT);
    String outPath = cmdline.getOptionValue(OUTPUT);
    int nodeCount = Integer.parseInt(cmdline.getOptionValue(NUM_NODES));
    int numParts = Integer.parseInt(cmdline.getOptionValue(NUM_PARTITIONS));
    boolean useRange = cmdline.hasOption(RANGE);

    LOG.info("Tool name: " + PartitionGraph.class.getSimpleName());
    LOG.info(" - input dir: " + inPath);
    LOG.info(" - output dir: " + outPath);
    LOG.info(" - num partitions: " + numParts);
    LOG.info(" - node cnt: " + nodeCount);
    LOG.info(" - use range partitioner: " + useRange);

    Configuration conf = getConf();
    conf.setInt("NodeCount", nodeCount);

    Job job = Job.getInstance(conf);
    job.setJobName(PartitionGraph.class.getSimpleName() + ":" + inPath);
    job.setJarByClass(PartitionGraph.class);

    job.setNumReduceTasks(numParts);

    FileInputFormat.setInputPaths(job, new Path(inPath));
    FileOutputFormat.setOutputPath(job, new Path(outPath));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNodeMultiSrc.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNodeMultiSrc.class);

    if (useRange) {
        job.setPartitionerClass(RangePartitioner.class);
    }

    FileSystem.get(conf).delete(new Path(outPath), true);

    job.waitForCompletion(true);

    return 0;
}
From source file:HadoopUtilsTest.java
License:Apache License
public static void main(String[] args) throws IOException {
    Configuration configuration = HBaseConfiguration.create();
    FileSystem fileSystem = null;
    try {
        fileSystem = FileSystem.get(configuration);
        FileStatus[] fileStatuses = fileSystem.listStatus(
                new Path("/icntv/grade/correlate-result/2013-12-12"), new PathFilter() {
                    @Override
                    public boolean accept(Path path) {
                        return path.getName().matches("part-r-\\d*");
                    }
                });
        for (FileStatus f : fileStatuses) {
            IOUtils.copyBytes(fileSystem.open(f.getPath()), System.out, 4096, false);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (null != fileSystem) {
            fileSystem.close();
        }
    }
}