List of usage examples for org.apache.hadoop.fs FSDataInputStream read
@Override public int read(ByteBuffer buf) throws IOException
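Before the individual examples, here is a minimal sketch of the read pattern they all share: open an FSDataInputStream from a FileSystem, loop on read(byte[]) until it returns -1, then close the stream. The Configuration and the path /tmp/example.txt are hypothetical placeholders for illustration only, not taken from any of the source files listed below.

    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class ReadLoopSketch {
        public static void main(String[] args) throws IOException {
            Configuration conf = new Configuration();
            Path path = new Path("/tmp/example.txt"); // hypothetical input file
            FileSystem fs = path.getFileSystem(conf);
            FSDataInputStream in = null;
            try {
                in = fs.open(path);
                byte[] buffer = new byte[4096];
                int read;
                // read(byte[]) returns the number of bytes read, or -1 at end of stream
                while ((read = in.read(buffer)) != -1) {
                    // process buffer[0..read) here
                }
            } finally {
                if (in != null)
                    in.close();
            }
        }
    }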
From source file:top10_categories.Top10_Categories.java
/**
 * @param args the command line arguments
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf1 = new Configuration();
    Configuration conf = new Configuration();
    Path inputDir = new Path(args[0]);
    Path hdfsFile = new Path(args[1]);
    FileSystem hdfs = FileSystem.get(conf);
    FileSystem local = FileSystem.getLocal(conf);
    try {
        FileStatus[] inputFiles = local.listStatus(inputDir);
        FSDataOutputStream out = hdfs.create(hdfsFile);
        for (int i = 0; i < inputFiles.length; i++) {
            System.out.println(inputFiles[i].getPath().getName());
            FSDataInputStream in = local.open(inputFiles[i].getPath());
            byte[] buffer = new byte[256];
            int bytesRead = 0;
            while ((bytesRead = in.read(buffer)) > 0) {
                out.write(buffer, 0, bytesRead);
            }
            in.close();
        }
        out.close();
    } catch (IOException e) {
        e.printStackTrace();
    }

    Job job1 = Job.getInstance(conf1, "Chaining");
    job1.setJarByClass(Top10_Categories.class);
    job1.setMapperClass(Map1.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(FloatWritable.class);
    job1.setReducerClass(Reduce1.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(DoubleWritable.class);
    job1.setCombinerClass(Reduce1.class);
    FileInputFormat.addInputPath(job1, hdfsFile);
    FileOutputFormat.setOutputPath(job1, new Path(args[2]));
    boolean complete = job1.waitForCompletion(true);

    Configuration conf2 = new Configuration();
    Job job2 = Job.getInstance(conf2, "Chaining");
    if (complete) {
        job2.setJarByClass(Top10_Categories.class);
        job2.setMapperClass(Map2.class);
        job2.setMapOutputKeyClass(FloatWritable.class);
        job2.setMapOutputValueClass(Text.class);
        job2.setReducerClass(Reduce2.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(FloatWritable.class);
        job2.setSortComparatorClass(SortKeyComparator.class);
        job2.setNumReduceTasks(1);
        FileInputFormat.addInputPath(job2, new Path(args[2]));
        FileOutputFormat.setOutputPath(job2, new Path(args[3]));
        System.exit(job2.waitForCompletion(true) ? 0 : 1);
    }
}
From source file:voldemort.store.readonly.fetcher.HdfsFetcher.java
License:Apache License
private void copyFileWithCheckSum(FileSystem fs, Path source, File dest, CopyStats stats,
                                  CheckSum fileCheckSumGenerator) throws IOException {
    logger.debug("Starting copy of " + source + " to " + dest);
    FSDataInputStream input = null;
    OutputStream output = null;
    for (int attempt = 0; attempt < NUM_RETRIES; attempt++) {
        boolean success = true;
        try {
            input = fs.open(source);
            output = new BufferedOutputStream(new FileOutputStream(dest));
            byte[] buffer = new byte[bufferSize];
            while (true) {
                int read = input.read(buffer);
                if (read < 0) {
                    break;
                } else {
                    output.write(buffer, 0, read);
                }
                if (fileCheckSumGenerator != null)
                    fileCheckSumGenerator.update(buffer, 0, read);
                if (throttler != null)
                    throttler.maybeThrottle(read);
                stats.recordBytes(read);
                if (stats.getBytesSinceLastReport() > reportingIntervalBytes) {
                    NumberFormat format = NumberFormat.getNumberInstance();
                    format.setMaximumFractionDigits(2);
                    logger.info(stats.getTotalBytesCopied() / (1024 * 1024) + " MB copied at "
                            + format.format(stats.getBytesPerSecond() / (1024 * 1024)) + " MB/sec - "
                            + format.format(stats.getPercentCopied()) + " % complete, destination:" + dest);
                    if (this.status != null) {
                        this.status.setStatus(stats.getTotalBytesCopied() / (1024 * 1024) + " MB copied at "
                                + format.format(stats.getBytesPerSecond() / (1024 * 1024)) + " MB/sec - "
                                + format.format(stats.getPercentCopied()) + " % complete, destination:" + dest);
                    }
                    stats.reset();
                }
            }
            logger.info("Completed copy of " + source + " to " + dest);
        } catch (IOException ioe) {
            success = false;
            logger.error("Error during copying file ", ioe);
            ioe.printStackTrace();
            if (attempt < NUM_RETRIES - 1) {
                logger.info("retrying copying");
            } else {
                throw ioe;
            }
        } finally {
            IOUtils.closeQuietly(output);
            IOUtils.closeQuietly(input);
            if (success) {
                break;
            }
        }
        logger.debug("Completed copy of " + source + " to " + dest);
    }
}
From source file:voldemort.store.readonly.fetcher.HdfsFetcherAdvancedTest.java
License:Apache License
private byte[] calculateCheckSumForFile(Path source) throws Exception {
    CheckSum fileCheckSumGenerator = CheckSum.getInstance(CheckSumType.MD5);
    byte[] buffer = new byte[VoldemortConfig.DEFAULT_FETCHER_BUFFER_SIZE];
    FSDataInputStream input = null;
    Configuration config = new Configuration();
    FileSystem fs = source.getFileSystem(config);
    input = fs.open(source);
    while (true) {
        int read = input.read(buffer);
        if (read < 0) {
            break;
        }
        // Update the per file checksum
        if (fileCheckSumGenerator != null) {
            fileCheckSumGenerator.update(buffer, 0, read);
        }
    }
    return fileCheckSumGenerator.getCheckSum();
}
From source file:voldemort.store.readonly.mapreduce.HadoopStoreBuilder.java
License:Apache License
/**
 * Run the job
 */
public void build() {
    try {
        Job job = new Job(config);
        job.getConfiguration().setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        job.getConfiguration().set("cluster.xml", new ClusterMapper().writeCluster(cluster));
        job.getConfiguration().set("stores.xml",
                new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
        job.getConfiguration().setBoolean("save.keys", saveKeys);
        job.getConfiguration().set("final.output.dir", outputDir.toString());
        job.getConfiguration().set("checksum.type", CheckSum.toString(checkSumType));
        job.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
        job.setMapperClass(mapperClass);
        job.setMapOutputKeyClass(BytesWritable.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setReducerClass(HadoopStoreBuilderReducer.class);
        job.setInputFormatClass(inputFormatClass);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(BytesWritable.class);
        job.setJarByClass(getClass());
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, tempDir);

        FileSystem outputFs = outputDir.getFileSystem(job.getConfiguration());
        if (outputFs.exists(outputDir)) {
            throw new IOException("Final output directory already exists.");
        }

        // delete output dir if it already exists
        FileSystem tempFs = tempDir.getFileSystem(job.getConfiguration());
        tempFs.delete(tempDir, true);

        long size = sizeOfPath(tempFs, inputPath);
        int numChunks = Math.max(
                (int) (storeDef.getReplicationFactor() * size / cluster.getNumberOfNodes() / chunkSizeBytes),
                1);
        logger.info("Data size = " + size + ", replication factor = " + storeDef.getReplicationFactor()
                + ", numNodes = " + cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes
                + ", num.chunks = " + numChunks);
        job.getConfiguration().setInt("num.chunks", numChunks);
        int numReduces = cluster.getNumberOfNodes() * numChunks;
        job.setNumReduceTasks(numReduces);
        logger.info("Number of reduces: " + numReduces);

        logger.info("Building store...");
        job.waitForCompletion(true);

        ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata();
        if (saveKeys)
            metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V2.getCode());
        else
            metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V1.getCode());

        // Check if all folder exists and with format file
        for (Node node : cluster.getNodes()) {
            Path nodePath = new Path(outputDir.toString(), "node-" + node.getId());
            if (!outputFs.exists(nodePath)) {
                outputFs.mkdirs(nodePath); // Create empty folder
            }

            // Write metadata
            FSDataOutputStream metadataStream = outputFs.create(new Path(nodePath, ".metadata"));
            metadataStream.write(metadata.toJsonString().getBytes());
            metadataStream.flush();
            metadataStream.close();
        }

        if (checkSumType != CheckSumType.NONE) {
            // Generate checksum for every node
            FileStatus[] nodes = outputFs.listStatus(outputDir);

            // Do a CheckSumOfCheckSum - Similar to HDFS
            CheckSum checkSumGenerator = CheckSum.getInstance(this.checkSumType);
            if (checkSumGenerator == null) {
                throw new VoldemortException("Could not generate checksum digests");
            }

            for (FileStatus node : nodes) {
                if (node.isDir()) {
                    FileStatus[] storeFiles = outputFs.listStatus(node.getPath(), new PathFilter() {

                        public boolean accept(Path arg0) {
                            if (arg0.getName().endsWith("checksum") && !arg0.getName().startsWith(".")) {
                                return true;
                            }
                            return false;
                        }
                    });

                    if (storeFiles != null) {
                        Arrays.sort(storeFiles, new IndexFileLastComparator());
                        for (FileStatus file : storeFiles) {
                            FSDataInputStream input = outputFs.open(file.getPath());
                            byte fileCheckSum[] = new byte[CheckSum.checkSumLength(this.checkSumType)];
                            input.read(fileCheckSum);
                            checkSumGenerator.update(fileCheckSum);
                            outputFs.delete(file.getPath(), true);
                        }
                        FSDataOutputStream checkSumStream = outputFs.create(
                                new Path(node.getPath(), CheckSum.toString(checkSumType) + "checkSum.txt"));
                        checkSumStream.write(checkSumGenerator.getCheckSum());
                        checkSumStream.flush();
                        checkSumStream.close();
                    }
                }
            }
        }
    } catch (Exception e) {
        logger.error("Error = " + e);
        throw new VoldemortException(e);
    }
}
From source file:voldemort.store.readonly.mr.HadoopStoreBuilder.java
License:Apache License
/**
 * Run the job
 */
public void build() {
    try {
        JobConf conf = new JobConf(config);
        conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
        conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster));
        conf.set("stores.xml",
                new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef)));
        conf.setBoolean("save.keys", saveKeys);
        conf.setBoolean("reducer.per.bucket", reducerPerBucket);
        if (!isAvro) {
            conf.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
            conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(BytesWritable.class);
            conf.setMapOutputValueClass(BytesWritable.class);
            if (reducerPerBucket) {
                conf.setReducerClass(HadoopStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(HadoopStoreBuilderReducer.class);
            }
        }
        conf.setInputFormat(inputFormatClass);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        conf.setOutputKeyClass(BytesWritable.class);
        conf.setOutputValueClass(BytesWritable.class);
        conf.setJarByClass(getClass());
        conf.setReduceSpeculativeExecution(false);
        FileInputFormat.setInputPaths(conf, inputPath);
        conf.set("final.output.dir", outputDir.toString());
        conf.set("checksum.type", CheckSum.toString(checkSumType));
        FileOutputFormat.setOutputPath(conf, tempDir);

        FileSystem outputFs = outputDir.getFileSystem(conf);
        if (outputFs.exists(outputDir)) {
            throw new IOException("Final output directory already exists.");
        }

        // delete output dir if it already exists
        FileSystem tempFs = tempDir.getFileSystem(conf);
        tempFs.delete(tempDir, true);

        long size = sizeOfPath(tempFs, inputPath);
        logger.info("Data size = " + size + ", replication factor = " + storeDef.getReplicationFactor()
                + ", numNodes = " + cluster.getNumberOfNodes() + ", chunk size = " + chunkSizeBytes);

        // Derive "rough" number of chunks and reducers
        int numReducers;
        if (saveKeys) {
            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / storeDef.getReplicationFactor() / chunkSizeBytes),
                        1);
            } else {
                logger.info("Overriding chunk size byte and taking num chunks (" + this.numChunks
                        + ") directly");
            }
            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor();
            } else {
                numReducers = cluster.getNumberOfPartitions() * storeDef.getReplicationFactor() * numChunks;
            }
        } else {
            if (this.numChunks == -1) {
                this.numChunks = Math.max((int) (storeDef.getReplicationFactor() * size
                        / cluster.getNumberOfPartitions() / chunkSizeBytes), 1);
            } else {
                logger.info("Overriding chunk size byte and taking num chunks (" + this.numChunks
                        + ") directly");
            }
            if (reducerPerBucket) {
                numReducers = cluster.getNumberOfPartitions();
            } else {
                numReducers = cluster.getNumberOfPartitions() * numChunks;
            }
        }
        conf.setInt("num.chunks", numChunks);
        conf.setNumReduceTasks(numReducers);

        if (isAvro) {
            conf.setPartitionerClass(AvroStoreBuilderPartitioner.class);
            // conf.setMapperClass(mapperClass);
            conf.setMapOutputKeyClass(ByteBuffer.class);
            conf.setMapOutputValueClass(ByteBuffer.class);
            conf.setInputFormat(inputFormatClass);
            conf.setOutputFormat((Class<? extends OutputFormat>) AvroOutputFormat.class);
            conf.setOutputKeyClass(ByteBuffer.class);
            conf.setOutputValueClass(ByteBuffer.class);

            // AvroJob confs for the avro mapper
            AvroJob.setInputSchema(conf, Schema.parse(config.get("avro.rec.schema")));
            AvroJob.setOutputSchema(conf,
                    Pair.getPairSchema(Schema.create(Schema.Type.BYTES), Schema.create(Schema.Type.BYTES)));
            AvroJob.setMapperClass(conf, mapperClass);
            if (reducerPerBucket) {
                conf.setReducerClass(AvroStoreBuilderReducerPerBucket.class);
            } else {
                conf.setReducerClass(AvroStoreBuilderReducer.class);
            }
        }

        logger.info("Number of chunks: " + numChunks + ", number of reducers: " + numReducers
                + ", save keys: " + saveKeys + ", reducerPerBucket: " + reducerPerBucket);
        logger.info("Building store...");
        RunningJob job = JobClient.runJob(conf);

        // Once the job has completed log the counter
        Counters counters = job.getCounters();
        if (saveKeys) {
            if (reducerPerBucket) {
                logger.info("Number of collisions in the job - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS));
                logger.info("Maximum number of collisions for one entry - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS));
            } else {
                logger.info("Number of collisions in the job - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.NUM_COLLISIONS));
                logger.info("Maximum number of collisions for one entry - "
                        + counters.getCounter(KeyValueWriter.CollisionCounter.MAX_COLLISIONS));
            }
        }

        // Do a CheckSumOfCheckSum - Similar to HDFS
        CheckSum checkSumGenerator = CheckSum.getInstance(this.checkSumType);
        if (!this.checkSumType.equals(CheckSumType.NONE) && checkSumGenerator == null) {
            throw new VoldemortException("Could not generate checksum digest for type " + this.checkSumType);
        }

        // Check if all folder exists and with format file
        for (Node node : cluster.getNodes()) {

            ReadOnlyStorageMetadata metadata = new ReadOnlyStorageMetadata();
            if (saveKeys) {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V2.getCode());
            } else {
                metadata.add(ReadOnlyStorageMetadata.FORMAT, ReadOnlyStorageFormat.READONLY_V1.getCode());
            }

            Path nodePath = new Path(outputDir.toString(), "node-" + node.getId());

            if (!outputFs.exists(nodePath)) {
                logger.info("No data generated for node " + node.getId() + ". Generating empty folder");
                outputFs.mkdirs(nodePath); // Create empty folder
                outputFs.setPermission(nodePath, new FsPermission(HADOOP_FILE_PERMISSION));
                logger.info("Setting permission to 755 for " + nodePath);
            }

            if (checkSumType != CheckSumType.NONE) {

                FileStatus[] storeFiles = outputFs.listStatus(nodePath, new PathFilter() {

                    public boolean accept(Path arg0) {
                        if (arg0.getName().endsWith("checksum") && !arg0.getName().startsWith(".")) {
                            return true;
                        }
                        return false;
                    }
                });

                if (storeFiles != null && storeFiles.length > 0) {
                    Arrays.sort(storeFiles, new IndexFileLastComparator());
                    FSDataInputStream input = null;

                    for (FileStatus file : storeFiles) {
                        try {
                            input = outputFs.open(file.getPath());
                            byte fileCheckSum[] = new byte[CheckSum.checkSumLength(this.checkSumType)];
                            input.read(fileCheckSum);
                            logger.debug("Checksum for file " + file.toString() + " - "
                                    + new String(Hex.encodeHex(fileCheckSum)));
                            checkSumGenerator.update(fileCheckSum);
                        } catch (Exception e) {
                            logger.error("Error while reading checksum file " + e.getMessage(), e);
                        } finally {
                            if (input != null)
                                input.close();
                        }
                        outputFs.delete(file.getPath(), false);
                    }

                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM_TYPE, CheckSum.toString(checkSumType));
                    String checkSum = new String(Hex.encodeHex(checkSumGenerator.getCheckSum()));
                    logger.info("Checksum for node " + node.getId() + " - " + checkSum);
                    metadata.add(ReadOnlyStorageMetadata.CHECKSUM, checkSum);
                }
            }

            // Write metadata
            Path metadataPath = new Path(nodePath, ".metadata");
            FSDataOutputStream metadataStream = outputFs.create(metadataPath);
            outputFs.setPermission(metadataPath, new FsPermission(HADOOP_FILE_PERMISSION));
            logger.info("Setting permission to 755 for " + metadataPath);
            metadataStream.write(metadata.toJsonString().getBytes());
            metadataStream.flush();
            metadataStream.close();
        }
    } catch (Exception e) {
        logger.error("Error in Store builder", e);
        throw new VoldemortException(e);
    }
}
From source file:voldemort.store.readonly.mr.HadoopStoreBuilderUtils.java
License:Apache License
/**
 * Given a filesystem, path and buffer-size, read the file contents and
 * present it as a string
 *
 * @param fs Underlying filesystem
 * @param path The file to read
 * @param bufferSize The buffer size to use for reading
 * @return The contents of the file as a string
 * @throws IOException
 */
public static String readFileContents(FileSystem fs, Path path, int bufferSize) throws IOException {
    if (bufferSize <= 0)
        return new String();

    FSDataInputStream input = fs.open(path);
    byte[] buffer = new byte[bufferSize];
    ByteArrayOutputStream stream = new ByteArrayOutputStream();

    while (true) {
        int read = input.read(buffer);
        if (read < 0) {
            break;
        } else {
            buffer = ByteUtils.copy(buffer, 0, read);
        }
        stream.write(buffer);
    }

    return new String(stream.toByteArray());
}