List of usage examples for org.apache.hadoop.fs.FileSystem.open
public FSDataInputStream open(PathHandle fd) throws IOException
From source file:com.davidgildeh.hadoop.utils.FileUtils.java
License:Apache License
/** * Merges a list of input files in a directory to a single file under the * outputpath with a specified filename//from w ww .j av a 2s .c om * * @param inputPath The input directory containing all the input files. E.g. /input/dir/on/hdfs/ * @param outputPath The output path to output the file. E.g. /output/dir/on/hdfs/filename * @throws IOException */ public static void mergeFiles(String inputPath, String outputPath) throws IOException { Path inputDir = new Path(inputPath); Path outputFile = new Path(outputPath); FileSystem fileSystem = getFileSystem(outputFile); checkFileExists(fileSystem, inputDir); // Check the input path is a directory if (!fileSystem.getFileStatus(inputDir).isDir()) { LOG.error("Path '" + inputDir.toString() + "' is not a directory."); throw new IOException("Path '" + inputDir.toString() + "' is not a directory."); } // Create Output File OutputStream out = fileSystem.create(outputFile); try { FileStatus contents[] = fileSystem.listStatus(inputDir); // Loop through all files in directory and merge them into one file for (int i = 0; i < contents.length; i++) { if (!contents[i].isDir()) { InputStream in = fileSystem.open(contents[i].getPath()); try { IOUtils.copyBytes(in, out, fileSystem.getConf(), false); } finally { in.close(); } } } } finally { out.close(); fileSystem.close(); LOG.info("Merged input files from '" + inputPath + "' to '" + outputPath + "'"); } }
From source file:com.dianping.cat.hadoop.hdfs.MessageBlockReader.java
License:Open Source License
public MessageBlockReader(FileSystem fs, Path basePath, String dataFile) throws IOException { m_indexFile = fs.open(new Path(basePath, dataFile + ".idx")); m_dataFile = fs.open(new Path(basePath, dataFile)); }
From source file:com.dianping.cat.hadoop.hdfs.MessageBlockReader.java
License:Open Source License
public MessageBlockReader(FileSystem fs, String dataFile) throws IOException { m_indexFile = fs.open(new Path(dataFile + ".idx")); m_dataFile = fs.open(new Path(dataFile)); }
From source file:com.dinglicom.clouder.mapreduce.input.LineRecordReader.java
License:Apache License
/**
 * Prepares the reader for one file split: records the split boundaries, opens
 * the file, selects a plain or decompressing line reader, and positions the
 * reader at the first record that belongs to this split.
 *
 * @param genericSplit the split to read; cast to {@code FileSplit}
 * @param context      task context supplying the job configuration
 * @throws IOException if the file cannot be opened or read
 */
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    System.out.println("-------------------length:" + split.getLength() + "\tposition:" + split.getStart());

    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();

    final Path file = split.getPath();
    key = new Text(FileToCDRType.getTypeByPath(file.getName()));
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // Open the file; how it is positioned depends on the compression in use.
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    if (!isCompressedInput()) {
        // Uncompressed input: seek straight to the start of the split.
        fileIn.seek(start);
        in = (null == this.recordDelimiterBytes)
                ? new LineReader(fileIn, job)
                : new LineReader(fileIn, job, this.recordDelimiterBytes);
        filePosition = fileIn;
    } else {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = (null == this.recordDelimiterBytes)
                    ? new LineReader(cIn, job)
                    : new LineReader(cIn, job, this.recordDelimiterBytes);
            // The codec reports the boundaries it actually uses for this split.
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = (null == this.recordDelimiterBytes)
                    ? new LineReader(codec.createInputStream(fileIn, decompressor), job)
                    : new LineReader(codec.createInputStream(fileIn, decompressor), job,
                            this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    }

    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:com.ds.lzo.DeprecatedLzoLineRecordReaderForCombined.java
License:Open Source License
/**
 * Creates a line reader over one split of a compressed file, logging the split
 * layout along the way.
 *
 * @param conf  job configuration, used to resolve the file system and codec
 * @param split the split to read
 * @throws IOException if no codec is registered for the file, or the file
 *                     cannot be opened or read
 */
public DeprecatedLzoLineRecordReaderForCombined(Configuration conf, FileSplit split) throws IOException {
    LOG.warn("split start: " + split.getStart());
    LOG.warn("split length: " + split.getLength());
    String[] locs = split.getLocations();
    for (String loc : locs) {
        LOG.warn("location: " + loc);
    }

    start = split.getStart();
    end = start + split.getLength();
    LOG.warn("split end: " + end);

    final Path file = split.getPath();
    LOG.warn("file: " + file.getName());
    LOG.warn("INT split start: " + (int) split.getStart());
    LOG.warn("INT split length: " + (int) split.getLength());
    LOG.warn("INT split end: " + (int) end);

    FileSystem fs = file.getFileSystem(conf);
    codecFactory = new CompressionCodecFactory(conf);
    final CompressionCodec codec = codecFactory.getCodec(file);

    // Check for a missing codec BEFORE dereferencing it for logging; otherwise
    // a missing codec surfaces as a NullPointerException instead of the
    // descriptive IOException below.
    if (codec == null) {
        throw new IOException("No LZO codec found, cannot run.");
    }
    LOG.warn("codec: " + codec.toString());
    LOG.warn("config: " + conf.toString());

    // Open the file and seek to the next split.
    fileIn = fs.open(file);

    boolean initialized = false;
    try {
        // Create input stream and read the file header.
        in = new LineReader(codec.createInputStream(fileIn), conf);

        if (start != 0) {
            fileIn.seek(start);
            LOG.warn("fileIn position: " + fileIn.getPos());
            LOG.warn("buffer size: " + conf.get("io.file.buffer.size"));

            // Read and ignore the first line.
            in.readLine(new Text());
            start = fileIn.getPos();
        }

        pos = start;
        initialized = true;
    } finally {
        if (!initialized) {
            // Construction failed: the caller never receives this reader and
            // therefore cannot close the stream that was already opened.
            try {
                fileIn.close();
            } catch (IOException ignored) {
                // Deliberately ignored so the original failure propagates.
            }
        }
    }
}
From source file:com.epam.catgenome.manager.bam.BamHelper.java
License:Open Source License
/**
 * Attaches an index, read from the location given by {@code indexFile}, to a
 * SAM input resource.
 *
 * @param samInputResource resource the index is attached to
 * @param indexFile        item whose path is the URI of the index file
 * @return the result of {@code samInputResource.index(...)} for the opened stream
 * @throws IOException if the file system or the index file cannot be opened
 */
private SamInputResource getHDFSIndex(SamInputResource samInputResource, BiologicalDataItem indexFile)
        throws IOException {
    final URI indexUri = URI.create(indexFile.getPath());
    final FileSystem indexFileSystem = FileSystem.get(indexUri, new Configuration());
    // The opened stream is handed over to the returned resource and is
    // therefore intentionally not closed here.
    final HdfsSeekableInputStream seekableIndex =
            new HdfsSeekableInputStream(indexFileSystem.open(new Path(indexUri)));
    return samInputResource.index(seekableIndex);
}
From source file:com.epam.catgenome.manager.bam.BamHelper.java
License:Open Source License
/**
 * Builds a SAM input resource backed by the BAM file at the URI stored in
 * {@code bamFile}.
 *
 * @param bamFile BAM file descriptor whose path is the URI to open
 * @return a resource wrapping a seekable stream over the opened file
 * @throws IOException if the file system or the BAM file cannot be opened
 */
@NotNull
private SamInputResource getHDFSSamInputResource(BamFile bamFile) throws IOException {
    final URI bamUri = URI.create(bamFile.getPath());
    final FileSystem bamFileSystem = FileSystem.get(bamUri, new Configuration());
    // The opened stream is handed over to the returned resource and is
    // therefore intentionally not closed here.
    final HdfsSeekableInputStream seekableBam =
            new HdfsSeekableInputStream(bamFileSystem.open(new Path(bamUri)));
    return SamInputResource.of(seekableBam);
}
From source file:com.ery.hadoop.mrddx.file.LineRecordReader.java
License:Apache License
void openFile() throws IOException { start = split.getStart();/* w w w . j a v a 2s.c o m*/ end = start + split.getLength(); final Path file = split.getPath(); LOG.info("split.getFileIndex=" + split.getFileIndex() + ",file.path=" + file.toString() + " fileEncodeing=" + fileEncodeing + " " + split.getStart() + ":" + split.getLength()); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); compressionCodecs = new CompressionCodecFactory(job); codec = compressionCodecs.getCodec(file); if (file.getName().endsWith(".zip")) { LOG.info("use ZipInputStream read file " + split.getPath()); ZipInputStream zin = new ZipInputStream(fileIn, Charset.forName(fileEncodeing)); in = new LineReader(zin, job); filePosition = fileIn; codec = new GzipCodec(); return; } if (isCompressedInput()) { decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream( fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK); // tar.gzTarInputStream // new TarInputStream(codec.createInputStream(fileIn, // decompressor) String filename = file.getName(); if (filename.endsWith(".tar.gz")) { in = new LineReader(new TarInputStream(cIn), job); } else { in = new LineReader(cIn, job); } start = cIn.getAdjustedStart(); end = cIn.getAdjustedEnd(); filePosition = cIn; // take pos from compressed stream } else { String filename = file.getName(); if (filename.endsWith(".tar.gz") || filename.endsWith(".tar")) { in = new LineReader(new TarInputStream(codec.createInputStream(fileIn, decompressor)), job); } else { in = new LineReader(codec.createInputStream(fileIn, decompressor), job); } filePosition = fileIn; } } else { fileIn.seek(start); String filename = file.getName(); if (filename.endsWith(".tar")) { in = new LineReader(new TarInputStream(fileIn), job); } else { in = 
new LineReader(fileIn, job); } filePosition = fileIn; } // If this is not the first split, we always throw away first record // because we always (except the last split) read one extra line in // next() method. if (start != 0) { start += in.readLine(new Text(), 0, maxBytesToConsume(start)); } this.pos = start; }
From source file:com.ery.hadoop.mrddx.hFile.LineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException { FileSplit split = (FileSplit) genericSplit; Configuration job = context.getConfiguration(); this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE); start = split.getStart();//from ww w . j a va2 s .co m end = start + split.getLength(); final Path file = split.getPath(); compressionCodecs = new CompressionCodecFactory(job); codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split FileSystem fs = file.getFileSystem(job); FSDataInputStream fileIn = fs.open(split.getPath()); if (isCompressedInput()) { decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream( fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK); // tar.gzTarInputStream // new TarInputStream(codec.createInputStream(fileIn, decompressor) String filename = file.getName(); if (filename.endsWith(".tar.gz")) { in = new LineReader(new TarInputStream(cIn), job); } else { in = new LineReader(cIn, job); } start = cIn.getAdjustedStart(); end = cIn.getAdjustedEnd(); filePosition = cIn; } else { String filename = file.getName(); if (filename.endsWith(".tar.gz")) { in = new LineReader(new TarInputStream(codec.createInputStream(fileIn, decompressor)), job); } else { in = new LineReader(codec.createInputStream(fileIn, decompressor), job); } filePosition = fileIn; } } else { fileIn.seek(start); in = new LineReader(fileIn, job); filePosition = fileIn; } // If this is not the first split, we always throw away first record // because we always (except the last split) read one extra line in // next() method. if (start != 0) { start += in.readLine(new Text(), 0, maxBytesToConsume(start)); } this.pos = start; }
From source file:com.examples.ch02.HdfsReader_Ex_2.java
public int run(String[] args) throws Exception { Path inputPath = new Path(args[0]); String localOutputPath = args[1]; Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); InputStream is = fs.open(inputPath); OutputStream os = new BufferedOutputStream(new FileOutputStream(localOutputPath)); IOUtils.copyBytes(is, os, conf);// w w w .j av a 2 s . co m return 0; }