List of usage examples for org.apache.hadoop.fs.Path.getFileSystem
public FileSystem getFileSystem(Configuration conf) throws IOException
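Before the full examples below, here is a minimal sketch of the pattern they all share: resolve the FileSystem that backs a given Path from the job Configuration, then operate on it. The path string and the existence check are illustrative placeholders, not taken from any of the source files listed below.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileSystemSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Illustrative path; the scheme (hdfs://, file://, s3a://, ...) decides which FileSystem is returned.
        Path path = new Path("/tmp/example-output");
        // getFileSystem() resolves the FileSystem implementation for this path's scheme and authority.
        FileSystem fs = path.getFileSystem(conf);
        if (fs.exists(path)) {
            // Recursive delete, as several of the job-setup examples below do for stale output.
            fs.delete(path, true);
        }
    }
}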
From source file:com.ci.backports.hadoop.hbase.ZHFileOutputFormat.java
License:Apache License
public RecordWriter<ImmutableBytesWritable, KeyValue> getRecordWriter(final TaskAttemptContext context)
        throws IOException, InterruptedException {
    // Get the path of the temporary output file
    final Path outputPath = FileOutputFormat.getOutputPath(context);
    final Path outputdir = new FileOutputCommitter(outputPath, context).getWorkPath();
    Configuration conf = context.getConfiguration();
    final FileSystem fs = outputdir.getFileSystem(conf);
    // These configs. are from hbase-*.xml
    final long maxsize = conf.getLong("hbase.hregion.max.filesize", HConstants.DEFAULT_MAX_FILE_SIZE);
    final int blocksize = conf.getInt("hfile.min.blocksize.size", HFile.DEFAULT_BLOCKSIZE);
    // Invented config. Add to hbase-*.xml if other than default compression.
    final String compression = conf.get("hfile.compression", Compression.Algorithm.NONE.getName());

    return new RecordWriter<ImmutableBytesWritable, KeyValue>() {
        // Map of families to writers and how much has been output on the writer.
        private final Map<byte[], WriterLength> writers = new TreeMap<byte[], WriterLength>(
                Bytes.BYTES_COMPARATOR);
        private byte[] previousRow = HConstants.EMPTY_BYTE_ARRAY;
        private final byte[] now = Bytes.toBytes(System.currentTimeMillis());
        private boolean rollRequested = false;

        public void write(ImmutableBytesWritable row, KeyValue kv) throws IOException {
            // null input == user explicitly wants to flush
            if (row == null && kv == null) {
                rollWriters();
                return;
            }
            byte[] rowKey = kv.getRow();
            long length = kv.getLength();
            byte[] family = kv.getFamily();
            WriterLength wl = this.writers.get(family);

            // If this is a new column family, verify that the directory exists
            if (wl == null) {
                fs.mkdirs(new Path(outputdir, Bytes.toString(family)));
            }

            // If any of the HFiles for the column families has reached
            // maxsize, we need to roll all the writers
            if (wl != null && wl.written + length >= maxsize) {
                this.rollRequested = true;
            }

            // This can only happen once a row is finished though
            if (rollRequested && Bytes.compareTo(this.previousRow, rowKey) != 0) {
                rollWriters();
            }

            // create a new HLog writer, if necessary
            if (wl == null || wl.writer == null) {
                wl = getNewWriter(family);
            }

            // we now have the proper HLog writer. full steam ahead
            kv.updateLatestStamp(this.now);
            wl.writer.append(kv);
            wl.written += length;

            // Copy the row so we know when a row transition.
            this.previousRow = rowKey;
        }

        private void rollWriters() throws IOException {
            for (WriterLength wl : this.writers.values()) {
                if (wl.writer != null) {
                    LOG.info("Writer=" + wl.writer.getPath()
                            + ((wl.written == 0) ? "" : ", wrote=" + wl.written));
                    close(wl.writer);
                }
                wl.writer = null;
                wl.written = 0;
            }
            this.rollRequested = false;
        }

        /*
         * Create a new HFile.Writer.
         * @param family
         * @return A WriterLength, containing a new HFile.Writer.
         * @throws IOException
         */
        private WriterLength getNewWriter(byte[] family) throws IOException {
            WriterLength wl = new WriterLength();
            Path familydir = new Path(outputdir, Bytes.toString(family));
            wl.writer = new HFile.Writer(fs, StoreFile.getUniqueFile(fs, familydir), blocksize, compression,
                    KeyValue.KEY_COMPARATOR);
            this.writers.put(family, wl);
            return wl;
        }

        private void close(final HFile.Writer w) throws IOException {
            if (w != null) {
                w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY, Bytes.toBytes(System.currentTimeMillis()));
                w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
                        Bytes.toBytes(context.getTaskAttemptID().toString()));
                w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY, Bytes.toBytes(true));
                w.close();
            }
        }

        public void close(TaskAttemptContext c) throws IOException, InterruptedException {
            for (WriterLength wl : this.writers.values()) {
                close(wl.writer);
            }
        }
    };
}
From source file:com.ci.backports.hadoop.hbase.ZHFileOutputFormat.java
License:Apache License
/**
 * Configure a MapReduce Job to perform an incremental load into the given
 * table. This
 * <ul>
 *   <li>Inspects the table to configure a total order partitioner</li>
 *   <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li>
 *   <li>Sets the number of reduce tasks to match the current number of regions</li>
 *   <li>Sets the output key/value class to match ZHFileOutputFormat's requirements</li>
 *   <li>Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or
 *       ZPutSortReducer)</li>
 * </ul>
 * The user should be sure to set the map output value class to either KeyValue or Put before
 * running this function.
 */
public static void configureIncrementalLoad(Job job, HTable table) throws IOException {
    Configuration conf = job.getConfiguration();
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setOutputFormatClass(ZHFileOutputFormat.class);

    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(ZPutSortReducer.class);
    } else {
        LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    LOG.info("Looking up current regions for table " + table);
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions "
            + "to match current region count");
    job.setNumReduceTasks(startKeys.size());

    Path partitionsPath = new Path(job.getWorkingDirectory(), "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionsPath);

    FileSystem fs = partitionsPath.getFileSystem(conf);
    writePartitions(conf, partitionsPath, startKeys);
    partitionsPath.makeQualified(fs);

    URI cacheUri;
    try {
        cacheUri = new URI(partitionsPath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.addCacheFile(cacheUri, conf);
    DistributedCache.createSymlink(conf);

    LOG.info("Incremental table output configured.");
}
From source file:com.citic.zxyjs.zwlscx.mapreduce.lib.input.HFileOutputFormatBase.java
License:Apache License
public RecordWriter<ImmutableBytesWritable, KeyValue> getRecordWriter(final TaskAttemptContext context)
        throws IOException, InterruptedException {
    // Get the path of the temporary output file
    final Path outputPath = FileOutputFormat.getOutputPath(context);
    final Path outputdir = new FileOutputCommitter(outputPath, context).getWorkPath();
    final Path ignoreOutputPath = new Path(outputPath + "_ignore");

    final Configuration conf = context.getConfiguration();
    final FileSystem fs = outputdir.getFileSystem(conf);
    // These configs. are from hbase-*.xml
    final long maxsize = conf.getLong(HConstants.HREGION_MAX_FILESIZE, HConstants.DEFAULT_MAX_FILE_SIZE);
    // Invented config. Add to hbase-*.xml if other than default compression.
    final String defaultCompression = conf.get("hfile.compression", Compression.Algorithm.NONE.getName());
    final boolean compactionExclude = conf.getBoolean("hbase.mapreduce.hfileoutputformat.compaction.exclude",
            false);

    if (fs.exists(ignoreOutputPath)) {
        LOG.info("Deleted " + ignoreOutputPath.toString() + " success.");
        fs.delete(ignoreOutputPath, true);
    }

    // create a map from column family to the compression algorithm
    final Map<byte[], String> compressionMap = createFamilyCompressionMap(conf);
    final Map<byte[], String> bloomTypeMap = createFamilyBloomMap(conf);
    final Map<byte[], String> blockSizeMap = createFamilyBlockSizeMap(conf);

    String dataBlockEncodingStr = conf.get(DATABLOCK_ENCODING_CONF_KEY);
    final HFileDataBlockEncoder encoder;
    if (dataBlockEncodingStr == null) {
        encoder = NoOpDataBlockEncoder.INSTANCE;
    } else {
        try {
            encoder = new HFileDataBlockEncoderImpl(DataBlockEncoding.valueOf(dataBlockEncodingStr));
        } catch (IllegalArgumentException ex) {
            throw new RuntimeException("Invalid data block encoding type configured for the param "
                    + DATABLOCK_ENCODING_CONF_KEY + " : " + dataBlockEncodingStr);
        }
    }

    return new RecordWriter<ImmutableBytesWritable, KeyValue>() {
        // Map of families to writers and how much has been output on the writer.
        private final Map<byte[], WriterLength> writers = new TreeMap<byte[], WriterLength>(
                Bytes.BYTES_COMPARATOR);
        private final FSDataOutputStream dos = fs.create(ignoreOutputPath);
        private byte[] previousRow = HConstants.EMPTY_BYTE_ARRAY;
        private final byte[] now = Bytes.toBytes(System.currentTimeMillis());
        private boolean rollRequested = false;

        public void write(ImmutableBytesWritable row, KeyValue kv) throws IOException {
            // null input == user explicitly wants to flush
            if (row == null && kv == null) {
                rollWriters();
                return;
            }
            byte[] rowKey = kv.getRow();
            long length = kv.getLength();
            byte[] family = kv.getFamily();
            if (ignore(kv)) {
                byte[] readBuf = rowKey;
                dos.write(readBuf, 0, readBuf.length);
                dos.write(Bytes.toBytes("\n"));
                return;
            }
            WriterLength wl = this.writers.get(family);

            // If this is a new column family, verify that the directory exists
            if (wl == null) {
                Path path = null;
                path = new Path(outputdir, Bytes.toString(family));
                fs.mkdirs(path);
            }

            // If any of the HFiles for the column families has reached
            // maxsize, we need to roll all the writers
            if (wl != null && wl.written + length >= maxsize) {
                this.rollRequested = true;
            }

            // This can only happen once a row is finished though
            if (rollRequested && Bytes.compareTo(this.previousRow, rowKey) != 0) {
                rollWriters();
            }

            // create a new HLog writer, if necessary
            if (wl == null || wl.writer == null) {
                wl = getNewWriter(family, conf);
            }

            // we now have the proper HLog writer. full steam ahead
            kv.updateLatestStamp(this.now);
            wl.writer.append(kv);
            wl.written += length;

            // Copy the row so we know when a row transition.
            this.previousRow = rowKey;
        }

        private void rollWriters() throws IOException {
            for (WriterLength wl : this.writers.values()) {
                if (wl.writer != null) {
                    LOG.info("Writer=" + wl.writer.getPath()
                            + ((wl.written == 0) ? "" : ", wrote=" + wl.written));
                    close(wl.writer);
                }
                wl.writer = null;
                wl.written = 0;
            }
            this.rollRequested = false;
        }

        /*
         * Create a new StoreFile.Writer.
         * @param family
         * @return A WriterLength, containing a new StoreFile.Writer.
         * @throws IOException
         */
        private WriterLength getNewWriter(byte[] family, Configuration conf) throws IOException {
            WriterLength wl = new WriterLength();
            Path familydir = new Path(outputdir, Bytes.toString(family));
            String compression = compressionMap.get(family);
            compression = compression == null ? defaultCompression : compression;
            String bloomTypeStr = bloomTypeMap.get(family);
            BloomType bloomType = BloomType.NONE;
            if (bloomTypeStr != null) {
                bloomType = BloomType.valueOf(bloomTypeStr);
            }
            String blockSizeString = blockSizeMap.get(family);
            int blockSize = blockSizeString == null ? HConstants.DEFAULT_BLOCKSIZE
                    : Integer.parseInt(blockSizeString);
            Configuration tempConf = new Configuration(conf);
            tempConf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0.0f);
            wl.writer = new StoreFile.WriterBuilder(conf, new CacheConfig(tempConf), fs, blockSize)
                    .withOutputDir(familydir)
                    .withCompression(AbstractHFileWriter.compressionByName(compression))
                    .withBloomType(bloomType)
                    .withComparator(KeyValue.COMPARATOR)
                    .withDataBlockEncoder(encoder)
                    .withChecksumType(HStore.getChecksumType(conf))
                    .withBytesPerChecksum(HStore.getBytesPerChecksum(conf))
                    .build();
            this.writers.put(family, wl);
            return wl;
        }

        private void close(final StoreFile.Writer w) throws IOException {
            if (w != null) {
                w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY, Bytes.toBytes(System.currentTimeMillis()));
                w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
                        Bytes.toBytes(context.getTaskAttemptID().toString()));
                w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY, Bytes.toBytes(true));
                w.appendFileInfo(StoreFile.EXCLUDE_FROM_MINOR_COMPACTION_KEY,
                        Bytes.toBytes(compactionExclude));
                w.appendTrackedTimestampsToMetadata();
                w.close();
            }
        }

        public void close(TaskAttemptContext c) throws IOException, InterruptedException {
            dos.flush();
            dos.close();
            for (WriterLength wl : this.writers.values()) {
                close(wl.writer);
            }
        }
    };
}
From source file:com.citic.zxyjs.zwlscx.mapreduce.lib.input.HFileOutputFormatBase.java
License:Apache License
/**
 * Write out a {@link SequenceFile} that can be read by
 * {@link TotalOrderPartitioner} that contains the split points in startKeys.
 */
private static void writePartitions(Configuration conf, Path partitionsPath,
        List<ImmutableBytesWritable> startKeys) throws IOException {
    LOG.info("Writing partition information to " + partitionsPath);
    if (startKeys.isEmpty()) {
        throw new IllegalArgumentException("No regions passed");
    }

    // We're generating a list of split points, and we don't ever
    // have keys < the first region (which has an empty start key)
    // so we need to remove it. Otherwise we would end up with an
    // empty reducer with index 0
    TreeSet<ImmutableBytesWritable> sorted = new TreeSet<ImmutableBytesWritable>(startKeys);

    ImmutableBytesWritable first = sorted.first();
    if (!first.equals(HConstants.EMPTY_BYTE_ARRAY)) {
        throw new IllegalArgumentException("First region of table should have empty start key. Instead has: "
                + Bytes.toStringBinary(first.get()));
    }
    sorted.remove(first);

    // Write the actual file
    FileSystem fs = partitionsPath.getFileSystem(conf);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, partitionsPath,
            ImmutableBytesWritable.class, NullWritable.class);

    try {
        for (ImmutableBytesWritable startKey : sorted) {
            writer.append(startKey, NullWritable.get());
        }
    } finally {
        writer.close();
    }
}
From source file:com.ckelsel.hadoop.mapreduce.WordCount.WordCount.java
License:Open Source License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: EventCount <in> <out>");
        System.exit(2);
    }

    Job job = Job.getInstance(conf, "event count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));

    // delete output if exists
    Path outPath = new Path(otherArgs[1]);
    outPath.getFileSystem(conf).delete(outPath, true);

    FileOutputFormat.setOutputPath(job, outPath);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.ckelsel.hadoop.MaxTemperature.App.java
License:Open Source License
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage: MaxTemperature <input path> <output path>");
        System.exit(-1);
    }

    System.out.println(args[0]);
    System.out.println(args[1]);

    try {
        Configuration conf = new Configuration();
        conf.set("mapred.job.tracker", "localhost:9001");

        Job job = Job.getInstance(conf);
        job.setJarByClass(App.class);
        job.setJobName("Max temperature");

        FileInputFormat.addInputPath(job, new Path(args[0]));

        // delete output if exists
        Path outPath = new Path(args[1]);
        outPath.getFileSystem(conf).delete(outPath, true);

        FileOutputFormat.setOutputPath(job, outPath);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : -1);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (ClassNotFoundException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (InterruptedException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
From source file:com.cloudera.bigdata.analysis.dataload.mapreduce.SplitableRecordReader.java
License:Apache License
/**
 * Decide the start of the reader.
 */
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // if (codec instanceof CryptoCodec && job instanceof JobConf)
    //     CryptoContextHelper.resetInputCryptoContext((CryptoCodec) codec,
    //         (JobConf) job, file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    LOG.info("Read from " + split.getPath().toString());

    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));

        // Read another line as previous.
        Text current = new Text();
        int newSize = in.readLine(previous, maxLineLength, maxBytesToConsume(start));
        LOG.info("Skip line " + previous + " for last split.");
        start += newSize;

        // Keep reading until a splitable point is found.
        while (start <= end) {
            newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start));
            if (canSplit(previous.getBytes(), current.getBytes())) {
                break;
            }
            start += newSize;
            previous.set(current.getBytes());
            LOG.info("Skip line " + previous + " for last split.");
        }

        // If exceed the end, still read one extra line.
        if (start > end) {
            if (isContinue) {
                newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start));
                if (!canSplit(previous.getBytes(), current.getBytes())) {
                    // Still not splitable. So skip the block.
                    start += newSize;
                    isContinue = false;
                }
            }
        }
        LOG.info("Split between: \n" + previous + "\n" + current);

        // Restart at the last read line.
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        this.pos = start;
    } else {
        Text skip = new Text();
        start += in.readLine(skip, maxLineLength, maxBytesToConsume(start));
        // start += in.readLine(skip, 0, maxBytesToConsume(start));
        LOG.info("Skip line " + skip + ". Start at " + start);
    }

    // Restart at the start index.
}
From source file:com.cloudera.ByteBufferRecordReader.java
License:Apache License
private void initialize(Configuration job, long splitStart, long splitLength, Path file) throws IOException {
    start = splitStart;
    end = start + splitLength;
    pos = start;

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    this.readStats = new ReadStatistics();
    this.bufferPool = new ElasticByteBufferPool();
    boolean skipChecksums = job.getBoolean("bytecount.skipChecksums", false);
    this.readOption = skipChecksums ? EnumSet.of(ReadOption.SKIP_CHECKSUMS) : EnumSet.noneOf(ReadOption.class);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        CompressionInputStream cIn = codec.createInputStream(fileIn, decompressor);
        filePosition = cIn;
        inputStream = cIn;
        LOG.info("Compressed input; cannot compute number of records in the split");
    } else {
        fileIn.seek(start);
        filePosition = fileIn;
        inputStream = fileIn;
        LOG.info("Split pos = " + start + " length " + splitLength);
    }
}
From source file:com.cloudera.cdk.data.filesystem.FileSystemDatasetRepository.java
License:Apache License
private static FileSystem fsForPath(Path dataPath, Configuration conf) {
    try {
        return dataPath.getFileSystem(conf);
    } catch (IOException ex) {
        throw new DatasetRepositoryException("Cannot get FileSystem for descriptor", ex);
    }
}
From source file:com.cloudera.cdk.data.filesystem.FileSystemMetadataProvider.java
License:Apache License
public FileSystemMetadataProvider(Configuration conf, Path rootDirectory) {
    Preconditions.checkArgument(conf != null, "Configuration cannot be null");
    Preconditions.checkArgument(rootDirectory != null, "Root cannot be null");
    this.conf = conf;
    try {
        this.rootFileSystem = rootDirectory.getFileSystem(conf);
        this.rootDirectory = rootFileSystem.makeQualified(rootDirectory);
    } catch (IOException ex) {
        throw new MetadataProviderException("Cannot get FileSystem for root path", ex);
    }
}