List of usage examples for org.apache.hadoop.fs FileSystem open
public FSDataInputStream open(PathHandle fd) throws IOException
From source file:FormatStorageBasicTest.java
License:Open Source License
/**
 * Verifies the state and on-disk layout left behind by closing a {@code FormatDataFile}.
 *
 * <p>Writes 1,000,000 seven-field records into one file, checks the writer's bookkeeping,
 * closes it, checks that the in-memory state was reset, and finally re-opens the raw file
 * to validate the index meta block and the line-index entry of the single segment.
 */
public void testClose() {
    try {
        Head head = new Head();
        head.setVar((byte) 1);
        Configuration conf = new Configuration();
        FormatDataFile fd = new FormatDataFile(conf);
        fd.create(prefix + "testClose", head);

        int size = 100 * 10000;
        for (int i = 0; i < size; i++) {
            Record record = new Record(7);
            record.addValue(new FieldValue((byte) 1, (short) 0));
            record.addValue(new FieldValue((short) 2, (short) 1));
            record.addValue(new FieldValue((int) 3, (short) 2));
            record.addValue(new FieldValue((long) 4, (short) 3));
            record.addValue(new FieldValue((float) 5.5, (short) 4));
            record.addValue(new FieldValue((double) 6.6, (short) 5));
            record.addValue(new FieldValue("hello konten", (short) 6));
            fd.addRecord(record);
        }

        if (fd.recordNum() != size) {
            fail("error record num:" + fd.recordNum());
        }
        // The segment must be checked before it is dereferenced; the original order
        // would have thrown a NullPointerException instead of reporting the failure.
        if (fd.currentSegment() == null) {
            fail("null current seg");
        }
        if (fd.currentSegment().currentUnit() == null) {
            fail("null current unit");
        }
        if (fd.segmentNum() != 0) {
            fail("error segment num:" + fd.segmentNum());
        }

        // Capture the expected lengths before close() resets the writer state.
        int headLen = head.len();
        long currentUnitLen = fd.currentSegment().currentUnit().len();
        long segmentLen = fd.currentSegment().len() + currentUnitLen + ConstVar.LineIndexRecordLen;
        long remain = fd.currentSegment().remain();
        int unitNum = fd.currentSegment().unitNum();

        fd.close();

        int indexLen = ConstVar.LineIndexRecordLen * fd.segmentNum();
        int metaLen = ConstVar.IndexMetaOffset;
        long fileLen = fd.getFileLen();
        if (fileLen != headLen + segmentLen + indexLen + metaLen) {
            fail("error file len:" + fileLen);
        }

        if (fd.in() != null) {
            fail("in should set null");
        }
        if (fd.out() != null) {
            fail("out should set null");
        }
        if (fd.recordNum() != 0) {
            fail("record num should set 0");
        }
        if (fd.keyIndexOffset != -1) {
            fail("key index offset not -1");
        }
        if (fd.lineIndexOffset != -1) {
            fail("line index offset not -1");
        }
        if (fd.currentOffset != -1) {
            fail("current offset not -1");
        }
        if (fd.hasLoadAllSegmentDone) {
            fail("has load all segment Done not false");
        }

        String fileName = prefix + "testClose";
        Path path = new Path(fileName);
        FileSystem fs = FileSystem.get(new Configuration());
        FSDataInputStream in = fs.open(path);
        try {
            // The index meta block sits at the very end of the file.
            long metaOffset = fileLen - ConstVar.IndexMetaOffset;
            in.seek(metaOffset);

            int recordNum = in.readInt();
            int segNum = in.readInt();
            long keyIndexOffset = in.readLong();
            long lineIndexOffset = in.readLong();
            if (recordNum != size) {
                fail("error record num:" + recordNum);
            }
            if (segNum != 1) {
                fail("error segNum:" + segNum);
            }
            if (keyIndexOffset != -1) {
                fail("error key index offset:" + keyIndexOffset);
            }
            if (lineIndexOffset != (headLen + segmentLen)) {
                fail("error line index offset:" + lineIndexOffset);
            }

            in.seek(lineIndexOffset);
            for (int i = 0; i < segNum; i++) {
                int beginLine = in.readInt();
                int endLine = in.readInt();
                long offset = in.readLong();
                long len = in.readLong();
                // Trailing int of the entry (named idx in the original); read but not verified.
                in.readInt();

                if (beginLine != 0) {
                    fail("error beginLine:" + beginLine);
                }
                if (endLine != size) {
                    fail("error end line:" + endLine);
                }
                if (offset != head.len()) {
                    fail("error offset:" + offset);
                }
                long tlen = size * full7chunkLen + size * 8 + ConstVar.DataChunkMetaOffset * (unitNum + 1)
                        + 28 * (unitNum + 1) + 24;
                if (len != tlen) {
                    fail("error len:" + len);
                }
            }
        } finally {
            // Release the stream even when an assertion above fails.
            in.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
        fail("get ioexception:" + e.getMessage());
    } catch (Exception e) {
        e.printStackTrace();
        fail("get exception:" + e.getMessage());
    }
}
From source file:WikipediaForwardIndexBuilder.java
License:Apache License
@SuppressWarnings("static-access") @Override//from w w w . j a v a 2 s . c om public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION)); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION)); options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg() .withDescription("two-letter language code").create(LANGUAGE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION)); String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION); String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000); if (!inputPath.isAbsolute()) { System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!"); return -1; } String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (language.length() != 2) { System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } } JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class); FileSystem fs = FileSystem.get(conf); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - input path: " + inputPath); LOG.info(" - index file: " + indexFile); LOG.info(" - language: " + language); LOG.info("Note: This tool only works on block-compressed SequenceFiles!"); 
conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language)); conf.setNumReduceTasks(1); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, new Path(tmpPath)); FileOutputFormat.setCompressOutput(conf, false); if (language != null) { conf.set("wiki.language", language); } conf.setInputFormat(NoSplitSequenceFileInputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(Text.class); conf.setMapRunnerClass(MyMapRunner.class); conf.setReducerClass(IdentityReducer.class); // Delete the output directory if it exists already. fs.delete(new Path(tmpPath), true); RunningJob job = JobClient.runJob(conf); Counters counters = job.getCounters(); int blocks = (int) counters.getCounter(Blocks.Total); LOG.info("number of blocks: " + blocks); LOG.info("Writing index file..."); LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000"))); FSDataOutputStream out = fs.create(new Path(indexFile), true); out.writeUTF(edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex.class.getCanonicalName()); out.writeUTF(inputPath.toString()); out.writeInt(blocks); int cnt = 0; Text line = new Text(); while (reader.readLine(line) > 0) { String[] arr = line.toString().split("\\s+"); int docno = Integer.parseInt(arr[0]); int offset = Integer.parseInt(arr[1]); short fileno = Short.parseShort(arr[2]); out.writeInt(docno); out.writeInt(offset); out.writeShort(fileno); cnt++; if (cnt % 100000 == 0) { LOG.info(cnt + " blocks written"); } } reader.close(); out.close(); if (cnt != blocks) { throw new RuntimeException("Error: mismatch in block count!"); } // Clean up. fs.delete(new Path(tmpPath), true); return 0; }
From source file:RunPageRankSchimmy.java
License:Apache License
private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner, boolean useRange) throws Exception { Configuration conf = getConf(); String in = path + "/iter" + FORMAT.format(i); String out = path + "/iter" + FORMAT.format(j) + "t"; String outm = out + "-mass"; FileSystem fs = FileSystem.get(conf); // We need to actually count the number of part files to get the number // of partitions (because the directory might contain _log). int numPartitions = 0; for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) { if (s.getPath().getName().contains("part-")) { numPartitions++;//from w w w .jav a2 s. c o m } } conf.setInt("NodeCount", n); Partitioner<IntWritable, Writable> p = null; if (useRange) { p = new RangePartitioner(); ((Configurable) p).setConf(conf); } else { p = new HashPartitioner<IntWritable, Writable>(); } // This is really annoying: the mapping between the partition numbers on // disk (i.e., part-XXXX) and what partition the file contains (i.e., // key.hash % #reducer) is arbitrary... so this means that we need to // open up each partition, peek inside to find out. 
IntWritable key = new IntWritable(); PageRankNode value = new PageRankNode(); FileStatus[] status = fs.listStatus(new Path(in)); StringBuilder sb = new StringBuilder(); for (FileStatus f : status) { if (!f.getPath().getName().contains("part-")) { continue; } SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(f.getPath())); reader.next(key, value); int np = p.getPartition(key, value, numPartitions); reader.close(); LOG.info(f.getPath() + "\t" + np); sb.append(np + "=" + f.getPath() + ";"); } LOG.info(sb.toString().trim()); LOG.info("PageRankSchimmy: iteration " + j + ": Phase1"); LOG.info(" - input: " + in); LOG.info(" - output: " + out); LOG.info(" - nodeCnt: " + n); LOG.info(" - useCombiner: " + useCombiner); LOG.info(" - useInmapCombiner: " + useInmapCombiner); LOG.info(" - numPartitions: " + numPartitions); LOG.info(" - useRange: " + useRange); LOG.info("computed number of partitions: " + numPartitions); int numReduceTasks = numPartitions; conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024); //conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.set("PageRankMassPath", outm); conf.set("BasePath", in); conf.set("PartitionMapping", sb.toString().trim()); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); Job job = Job.getInstance(conf); job.setJobName("PageRankSchimmy:iteration" + j + ":Phase1"); job.setJarByClass(RunPageRankSchimmy.class); job.setNumReduceTasks(numReduceTasks); FileInputFormat.setInputPaths(job, new Path(in)); FileOutputFormat.setOutputPath(job, new Path(out)); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(FloatWritable.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(PageRankNode.class); if (useInmapCombiner) { 
job.setMapperClass(MapWithInMapperCombiningClass.class); } else { job.setMapperClass(MapClass.class); } if (useCombiner) { job.setCombinerClass(CombineClass.class); } if (useRange) { job.setPartitionerClass(RangePartitioner.class); } job.setReducerClass(ReduceClass.class); FileSystem.get(conf).delete(new Path(out), true); FileSystem.get(conf).delete(new Path(outm), true); long startTime = System.currentTimeMillis(); job.waitForCompletion(true); System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); float mass = Float.NEGATIVE_INFINITY; for (FileStatus f : fs.listStatus(new Path(outm))) { FSDataInputStream fin = fs.open(f.getPath()); mass = sumLogProbs(mass, fin.readFloat()); fin.close(); } return mass; }
From source file:HadoopUtilsTest.java
License:Apache License
/**
 * Dumps every {@code part-r-*} file under a fixed HDFS result directory to standard output.
 *
 * @param args unused
 * @throws IOException if closing the file system fails
 */
public static void main(String[] args) throws IOException {
    Configuration configuration = HBaseConfiguration.create();
    FileSystem fileSystem = null;
    try {
        fileSystem = FileSystem.get(configuration);
        FileStatus[] fileStatuses = fileSystem.listStatus(new Path("/icntv/grade/correlate-result/2013-12-12"),
                new PathFilter() {
                    @Override
                    public boolean accept(Path path) {
                        return path.getName().matches("part-r-\\d*");
                    }
                });
        for (FileStatus f : fileStatuses) {
            java.io.InputStream in = fileSystem.open(f.getPath());
            try {
                // close=false keeps System.out usable for the next file, so the input
                // stream has to be closed explicitly below.
                IOUtils.copyBytes(in, System.out, 4096, false);
            } finally {
                in.close();
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (null != fileSystem) {
            fileSystem.close();
        }
    }
}
From source file:CountJob.java
License:Apache License
/**
 * Runs jobs "1" and "2", compares their outputs line by line, and publishes a result.
 *
 * <p>If both outputs are identical the first one is published; otherwise job "3" is run
 * and its output is published instead. The outputs of jobs "1" and "2" are deleted afterwards.
 *
 * @param args job arguments; {@code args[1]} is the directory the result is published to
 * @throws Exception if a job or any file-system operation fails
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String msgs = "";
    doJob("1", args, msgs);
    doJob("2", args, msgs);
    FileSystem hdfs = FileSystem.get(conf);

    Path firstOutput = new Path("/data/output/temp/12/part-r-00000");
    Path secondOutput = new Path("/data/output/temp/22/part-r-00000");

    boolean same = haveSameLines(hdfs, firstOutput, secondOutput);
    if (same) {
        System.out.print("same " + same + "\n");
        publishResult(hdfs, firstOutput, args[1]);
    } else {
        System.out.print("Different");
        doJob("3", args, msgs);
        publishResult(hdfs, new Path("/data/output/temp/32/part-r-00000"), args[1]);
    }

    hdfs.delete(firstOutput, true);
    hdfs.delete(secondOutput, true);
}

/** Returns true when both files contain exactly the same sequence of lines. */
private static boolean haveSameLines(FileSystem fs, Path left, Path right) throws java.io.IOException {
    BufferedReader leftReader = new BufferedReader(new InputStreamReader(fs.open(left)));
    try {
        BufferedReader rightReader = new BufferedReader(new InputStreamReader(fs.open(right)));
        try {
            String leftLine = leftReader.readLine();
            String rightLine = rightReader.readLine();
            // Advance while both files still yield equal lines.
            while (leftLine != null && leftLine.equals(rightLine)) {
                leftLine = leftReader.readLine();
                rightLine = rightReader.readLine();
            }
            // Identical only if both files ran out at the same time.
            return leftLine == null && rightLine == null;
        } finally {
            rightReader.close();
        }
    } finally {
        leftReader.close();
    }
}

/** Copies {@code source} via a local temp file into {@code outputDir} and writes the _SUCCESS marker. */
private static void publishResult(FileSystem fs, Path source, String outputDir) throws java.io.IOException {
    Path localP = new Path("/tmp/output.txt");
    fs.copyToLocalFile(source, localP);
    fs.copyFromLocalFile(localP, new Path(outputDir + "/part-r-00000"));
    fs.createNewFile(new Path(outputDir + "/_SUCCESS"));
    System.out.print("created result");
}
From source file:Script.java
License:Open Source License
/** Evaluates the Javascript expressions contained in a * DataInputStream serialized file and passed over the distributed * cache.//from w w w . ja va2s . co m * @param conf The Hadoop configuration object * @param pathString The path string of the cached file * @param name The name of the file added to the cache * @return The result of the Javascript evaluation */ public Object evalCache(Configuration conf, String pathString, String name) throws IOException { FSDataInputStream in; FileSystem fs = FileSystem.getLocal(conf); try { Path path = new Path(pathString); in = fs.open(path); } catch (FileNotFoundException e) { // must be running in standalone mode Path path = new Path(Eggshell.SCRIPT_DIR + "/" + name); in = fs.open(path); // read it from the eggshell script directory instead } String buf = in.readUTF(); in.close(); return evalString(buf); }
From source file:LookupPostingsCompressed.java
License:Apache License
/**
 * Runs this tool.
 *
 * <p>Looks up the compressed postings of a few fixed terms in the index at
 * {@code INDEX}: for "starcross'd" it prints each posting together with the matching
 * line of the collection; for "gold" and "silver" it prints the postings list and a
 * histogram of term frequencies; for "bronze" it reports whether the term is missing.
 *
 * @param args command-line arguments carrying the index and collection paths
 * @return 0 on completion (invalid arguments terminate the JVM with status -1)
 * @throws Exception if the index or the collection cannot be read
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INDEX));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(COLLECTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(INDEX) || !cmdline.hasOption(COLLECTION)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(LookupPostingsCompressed.class.getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.exit(-1);
    }

    String indexPath = cmdline.getOptionValue(INDEX);
    String collectionPath = cmdline.getOptionValue(COLLECTION);

    if (collectionPath.endsWith(".gz")) {
        System.out.println("gzipped collection is not seekable: use compressed version!");
        System.exit(-1);
    }

    Configuration config = new Configuration();
    FileSystem fs = FileSystem.get(config);
    MapFile.Reader reader = new MapFile.Reader(new Path(indexPath + "/part-r-00000"), config);
    try {
        FSDataInputStream collection = fs.open(new Path(collectionPath));
        try {
            Text key = new Text();
            PairOfWritables<VIntWritable, BytesWritable> value = new PairOfWritables<VIntWritable, BytesWritable>();

            System.out.println("Looking up postings for the term \"starcross'd\"");
            key.set("starcross'd");

            if (reader.get(key, value) == null) {
                System.out.println("the term starcross'd does not appear in the collection");
            } else {
                DataInputStream in = new DataInputStream(
                        new ByteArrayInputStream(value.getRightElement().copyBytes()));
                // Postings store gaps, so the absolute offset is the running sum.
                int offset = 0;
                while (in.available() != 0) {
                    offset += WritableUtils.readVInt(in);
                    int count = WritableUtils.readVInt(in);
                    System.out.print("(" + offset + ", " + count + ")");
                    collection.seek(offset);
                    // A BufferedReader created before the seek would keep serving its
                    // read-ahead buffer from the old position, so build a fresh one per
                    // posting. It is not closed here: that would close the collection.
                    BufferedReader lineReader = new BufferedReader(new InputStreamReader(collection));
                    System.out.println(lineReader.readLine());
                }
            }

            printPostingsAndHistogram(reader, "gold");
            printPostingsAndHistogram(reader, "silver");

            key.set("bronze");
            Writable w = reader.get(key, value);

            if (w == null) {
                System.out.println("the term bronze does not appear in the collection");
            }
        } finally {
            collection.close();
        }
    } finally {
        reader.close();
    }

    return 0;
}

/** Prints the full postings list of {@code term} followed by the histogram of its tf values. */
private static void printPostingsAndHistogram(MapFile.Reader reader, String term) throws java.io.IOException {
    Text key = new Text();
    key.set(term);
    PairOfWritables<VIntWritable, BytesWritable> value = new PairOfWritables<VIntWritable, BytesWritable>();

    if (reader.get(key, value) == null) {
        System.out.println("the term " + term + " does not appear in the collection");
        return;
    }

    DataInputStream in = new DataInputStream(new ByteArrayInputStream(value.getRightElement().copyBytes()));
    Int2IntFrequencyDistribution hist = new Int2IntFrequencyDistributionEntry();

    System.out.println("Complete postings list for '" + term + "': (" + value.getLeftElement() + ", [");
    // Each term starts from offset 0; the histogram is filled in the same pass.
    int offset = 0;
    while (in.available() != 0) {
        offset += WritableUtils.readVInt(in);
        int count = WritableUtils.readVInt(in);
        System.out.print("(" + offset + ", " + count + ")");
        System.out.print(", ");
        hist.increment(count);
    }
    System.out.print("])\n");

    System.out.println("histogram of tf values for " + term);
    for (PairOfInts pair : hist) {
        System.out.println(pair.getLeftElement() + "\t" + pair.getRightElement());
    }
}
From source file:FormatStoragePerformanceTest.java
License:Open Source License
static void doTextReadRand(int count) { try {//w w w.j av a 2 s .c o m String textFile = "MR_input_text/testPerformanceReadText"; Path path = new Path(textFile); FileSystem fs = FileSystem.get(new Configuration()); FSDataInputStream in = fs.open(path); InputStream stream = new BufferedInputStream(in); BufferedReader reader = new BufferedReader(new InputStreamReader(stream)); long begin = System.currentTimeMillis(); count = 1 * 1000; for (int i = 0; i < count; i++) { int line = (int) (Math.random() * count); for (int j = 0; j < line; j++) { String value = reader.readLine(); value = null; } } reader.close(); long end = System.currentTimeMillis(); String string = "text read seq, count:" + count + ", delay:" + (long) ((end - begin) / 1000) + " s"; output.write(string.getBytes()); System.out.println(string); } catch (Exception e) { e.printStackTrace(); System.out.println(e.getMessage()); } }
From source file:FormatStoragePerformanceTest.java
License:Open Source License
/**
 * Times sequentially reading and parsing up to {@code count} comma-separated lines
 * from the text benchmark file.
 *
 * @param count maximum number of lines to read
 * @param var   when true, reads the "_var" variant of the file and also touches a seventh field
 */
static void doTextReadSeq(int count, boolean var) {
    try {
        String textFile = "MR_input_text/testPerformanceReadText";
        if (var) {
            textFile += "_var";
        }
        Path path = new Path(textFile);
        FileSystem fs = FileSystem.get(new Configuration());
        FSDataInputStream in = fs.open(path);

        InputStream stream = new BufferedInputStream(in);
        BufferedReader reader = new BufferedReader(new InputStreamReader(stream));

        long begin = System.currentTimeMillis();
        try {
            for (int i = 0; i < count; i++) {
                String value = reader.readLine();
                if (value == null) {
                    // File holds fewer than count lines; stop instead of failing with an NPE.
                    break;
                }

                String[] fields = value.split(",");

                // Parse every field with its declared type; the results are discarded.
                Byte.valueOf(fields[0]);
                Short.valueOf(fields[1]);
                Integer.valueOf(fields[2]);
                Long.valueOf(fields[3]);
                Float.valueOf(fields[4]);
                Double.valueOf(fields[5]);
                if (var) {
                    String.valueOf(fields[6]);
                }
            }
        } finally {
            // Closed inside the timed section, as before, but now also on failure.
            reader.close();
        }
        long end = System.currentTimeMillis();

        String string = "text read seq " + count + " record over, delay: " + ((end - begin) / 1000) + " s \n";
        System.out.println(string);
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println(e.getMessage());
    }
}
From source file:ColumnStoragePerformanceTest.java
License:Open Source License
/**
 * Times sequentially reading up to {@code count} lines from {@code textFilename},
 * splitting each line and copying fixed-size byte chunks out of it.
 *
 * @param count maximum number of lines to read
 */
static void doTextReadSeq(int count) {
    try {
        ArrayList<Integer> meta = new ArrayList<Integer>(10);
        for (int i = 0; i < 7; i++) {
            meta.add(i);
        }

        Path path = new Path(textFilename);
        FileSystem fs = FileSystem.get(new Configuration());
        FSDataInputStream in = fs.open(path);

        InputStream stream = new BufferedInputStream(in);
        BufferedReader reader = new BufferedReader(new InputStreamReader(stream));

        long begin = System.currentTimeMillis();
        try {
            for (int i = 0; i < count; i++) {
                String value = reader.readLine();
                if (value == null) {
                    // File holds fewer than count lines; stop instead of failing with an NPE.
                    break;
                }

                // Result is unused; the call is kept so the per-line work matches the original.
                value.split(",");

                ByteArrayInputStream bin = new ByteArrayInputStream(value.getBytes());

                // One metadata lookup plus one fixed-size chunk read per field; results are discarded.
                meta.get(0);
                byte[] bb = new byte[4];
                bin.read(bb);
                meta.get(1);
                byte[] sb = new byte[6];
                bin.read(sb);
                meta.get(2);
                byte[] ib = new byte[9];
                bin.read(ib);
                meta.get(3);
                byte[] lb = new byte[13];
                bin.read(lb);
                meta.get(4);
                byte[] fb = new byte[13];
                bin.read(fb);
                meta.get(5);
                byte[] db = new byte[18];
                bin.read(db);
                meta.get(6);
            }
        } finally {
            // Closed inside the timed section, as before, but now also on failure.
            reader.close();
        }
        long end = System.currentTimeMillis();

        String string = "text read seq " + count + " record over, delay: " + ((end - begin) / 1000) + " s \n";
        System.out.println(string);
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println(e.getMessage());
    }
}