List of usage examples for org.apache.hadoop.fs FileSystem open
public FSDataInputStream open(Path f) throws IOException
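All of the examples below call the single-argument overload open(Path f), which returns a seekable FSDataInputStream. As orientation, a minimal usage sketch (hedged: the helper name countBytes and the byte-counting loop are ours, not taken from any example below):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Minimal sketch: open an HDFS file and count its bytes.
public static long countBytes(Configuration conf, Path file) throws IOException {
    FileSystem fs = file.getFileSystem(conf);
    long total = 0;
    try (FSDataInputStream in = fs.open(file)) { // the returned stream is seekable
        byte[] buf = new byte[4096];
        int n;
        while ((n = in.read(buf)) > 0) {
            total += n;
        }
    }
    return total;
}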
From source file:com.hadoop.compression.fourmc.FourMzInputStream.java
License:BSD License
/**
 * Reads the block index at the tail of the file.
 *
 * @param fs   filesystem
 * @param file path to 4mc file
 * @return block index
 * @throws IOException
 */
public static FourMzBlockIndex readIndex(FileSystem fs, Path file) throws IOException {
    long fileSize = fs.getFileStatus(file).getLen();
    if (fileSize < (12 + 20)) { // file too small
        return new FourMzBlockIndex();
    }

    FSDataInputStream indexIn = fs.open(file);

    /*
     * Jump to the file tail and read ahead the last 4KB of the file, which should be enough
     * in most cases. Improvement: we could estimate a best-case compression factor of 10%
     * and forecast from the file size and block size whether to read ahead more.
     */
    int readTailSize = 4 * 1024;
    if (readTailSize > (fileSize - 12))
        readTailSize = (int) (fileSize - 12);

    indexIn.seek(fileSize - readTailSize);
    byte[] buf = new byte[readTailSize];
    readFully(indexIn, buf, 0, buf.length);
    int footerSize = getInt(buf, buf.length - 12);
    int magic = getInt(buf, buf.length - 8);
    int checksum = getInt(buf, buf.length - 4);

    if (magic != FourMzCodec.FOURMZ_MAGIC) {
        throw new IOException("Invalid 4mc footer magic");
    }
    if (footerSize >= (fileSize - 12)) {
        throw new IOException("Invalid 4mc footer size");
    }

    // Very rare case: the read-ahead was not enough. Seek back and read the whole footer.
    if (footerSize > readTailSize) {
        readTailSize = footerSize;
        indexIn.seek(fileSize - readTailSize);
        buf = new byte[readTailSize];
        readFully(indexIn, buf, 0, buf.length);
    }
    indexIn.close();

    int startFooterOffset = readTailSize - footerSize;

    if (getInt(buf, startFooterOffset) != footerSize) { // size again
        throw new IOException("Invalid 4mc footer size");
    }
    if (getInt(buf, startFooterOffset + 4) != FourMzCodec.FOURMZ_VERSION) { // version
        throw new IOException("Invalid 4mc footer version (" + getInt(buf, startFooterOffset + 4) + ")");
    }
    if (checksum != ZstdDecompressor.xxhash32(buf, startFooterOffset, footerSize - 4, 0)) {
        throw new IOException("Invalid 4mc footer checksum");
    }

    int totalBlocks = (footerSize - 20) / 4;
    FourMzBlockIndex index = new FourMzBlockIndex(totalBlocks);
    long curOffset = 0;
    for (int i = 0; i < totalBlocks; ++i) {
        curOffset += getInt(buf, startFooterOffset + 8 + (i * 4));
        index.set(i, curOffset);
    }
    return index;
}
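The offsets read above imply the following tail layout for a 4mz file. This is reconstructed from the code alone, not from the 4mc format specification, so treat the field names as informal labels:

// Tail of a 4mz file, as implied by readIndex (N = number of blocks):
//
//   footerSize    : 4 bytes   (also verified at the footer start)
//   version       : 4 bytes   (must equal FOURMZ_VERSION)
//   blockDelta[N] : 4 bytes each, N = (footerSize - 20) / 4
//   footerSize    : 4 bytes   (repeated in the final 12 bytes)
//   magic         : 4 bytes   (must equal FOURMZ_MAGIC)
//   checksum      : 4 bytes   (xxhash32 over the footer minus this field)
//
// Absolute block offsets are recovered by a running sum over blockDelta.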
From source file:com.hadoop.compression.lzo.LzoIndex.java
License:Open Source License
/**
 * Read the index of the lzo file.
 *
 * @param fs      The index file is on this file system.
 * @param lzoFile the file whose index we are reading -- NOT the index file itself. That is,
 *                pass in filename.lzo, not filename.lzo.index, for this parameter.
 * @throws IOException
 */
public static LzoIndex readIndex(FileSystem fs, Path lzoFile) throws IOException {
    FSDataInputStream indexIn = null;
    Path indexFile = lzoFile.suffix(LZO_INDEX_SUFFIX);

    try {
        indexIn = fs.open(indexFile);
    } catch (IOException fileNotFound) {
        // return empty index, fall back to the unsplittable mode
        return new LzoIndex();
    }

    int capacity = 16 * 1024 * 8; // size for a 4GB file (with 256KB lzo blocks)
    DataOutputBuffer bytes = new DataOutputBuffer(capacity);

    // copy indexIn and close it
    IOUtils.copyBytes(indexIn, bytes, 4 * 1024, true);

    ByteBuffer bytesIn = ByteBuffer.wrap(bytes.getData(), 0, bytes.getLength());
    int blocks = bytesIn.remaining() / 8;
    LzoIndex index = new LzoIndex(blocks);

    for (int i = 0; i < blocks; i++) {
        index.set(i, bytesIn.getLong());
    }

    return index;
}
From source file:com.hadoop.compression.lzo.LzoIndex.java
License:Open Source License
/**
 * Index an lzo file to allow the input format to split it into separate map
 * jobs.
 *
 * @param fs      File system that contains the file.
 * @param lzoFile the lzo file to index. For filename.lzo, the created index file will be
 *                filename.lzo.index.
 * @throws IOException
 */
public static void createIndex(FileSystem fs, Path lzoFile) throws IOException {
    Configuration conf = fs.getConf();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(lzoFile);
    if (null == codec) {
        throw new IOException("Could not find codec for file " + lzoFile
                + " - you may need to add the LZO codec to your io.compression.codecs "
                + "configuration in core-site.xml");
    }
    ((Configurable) codec).setConf(conf);

    FSDataInputStream is = null;
    FSDataOutputStream os = null;
    Path outputFile = lzoFile.suffix(LZO_INDEX_SUFFIX);
    Path tmpOutputFile = lzoFile.suffix(LZO_TMP_INDEX_SUFFIX);

    // Track whether an exception was thrown or not, so we know to either
    // delete the tmp index file on failure, or rename it to the new index file on success.
    boolean indexingSucceeded = false;
    try {
        is = fs.open(lzoFile);
        os = fs.create(tmpOutputFile);
        LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
        // Solely for reading the header
        codec.createInputStream(is, decompressor);
        int numCompressedChecksums = decompressor.getCompressedChecksumsCount();
        int numDecompressedChecksums = decompressor.getDecompressedChecksumsCount();

        while (true) {
            // read and ignore, we just want to get to the next int
            int uncompressedBlockSize = is.readInt();
            if (uncompressedBlockSize == 0) {
                break;
            } else if (uncompressedBlockSize < 0) {
                throw new EOFException();
            }

            int compressedBlockSize = is.readInt();
            if (compressedBlockSize <= 0) {
                throw new IOException("Could not read compressed block size");
            }

            // See LzopInputStream.getCompressedData
            boolean isUncompressedBlock = (uncompressedBlockSize == compressedBlockSize);
            int numChecksumsToSkip = isUncompressedBlock ? numDecompressedChecksums
                    : numDecompressedChecksums + numCompressedChecksums;
            long pos = is.getPos();
            // write the pos of the block start
            os.writeLong(pos - 8);
            // seek to the start of the next block, skip any checksums
            is.seek(pos + compressedBlockSize + (4 * numChecksumsToSkip));
        }
        // If we're here, indexing was successful.
        indexingSucceeded = true;
    } finally {
        // Close any open streams.
        if (is != null) {
            is.close();
        }
        if (os != null) {
            os.close();
        }
        if (!indexingSucceeded) {
            // If indexing didn't succeed (i.e. an exception was thrown), clean up after ourselves.
            fs.delete(tmpOutputFile, false);
        } else {
            // Otherwise, rename filename.lzo.index.tmp to filename.lzo.index.
            fs.rename(tmpOutputFile, outputFile);
        }
    }
}
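A minimal driver sketch for the two LzoIndex methods above (hedged: indexLzoFile is a hypothetical wrapper, the path is a placeholder, and getNumberOfBlocks() is assumed from the block-count constructor used by readIndex):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical driver: index an .lzo file so the input format can split it,
// then read the index back to confirm.
public static void indexLzoFile(String pathStr) throws IOException {
    Configuration conf = new Configuration();
    Path lzoFile = new Path(pathStr); // e.g. "/data/logs.lzo" (placeholder)
    FileSystem fs = lzoFile.getFileSystem(conf);

    LzoIndex.createIndex(fs, lzoFile); // writes logs.lzo.index next to the file
    LzoIndex index = LzoIndex.readIndex(fs, lzoFile);
    System.out.println("indexed blocks: " + index.getNumberOfBlocks());
}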
From source file:com.hadoop.mapred.DeprecatedLzoLineRecordReader.java
License:Open Source License
DeprecatedLzoLineRecordReader(Configuration conf, FileSplit split) throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    FileSystem fs = file.getFileSystem(conf);
    codecFactory = new CompressionCodecFactory(conf);
    final CompressionCodec codec = codecFactory.getCodec(file);
    if (codec == null) {
        throw new IOException("No LZO codec found, cannot run.");
    }

    // Open the file and seek to the next split.
    fileIn = fs.open(file);
    // Create input stream and read the file header.
    in = new LineReader(codec.createInputStream(fileIn), conf);
    if (start != 0) {
        fileIn.seek(start);

        // Read and ignore the first line.
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    pos = start;
}
From source file:com.hadoop.mapreduce.FourMcLineRecordReader.java
License:BSD License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    Configuration job = HadoopUtils.getConfiguration(context);
    maxLineLen = job.getInt(MAX_LINE_LEN_CONF, Integer.MAX_VALUE);

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("Codec for file " + file + " not found, cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    in = new LineReader(codec.createInputStream(fileIn), job);

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    this.pos = start;
}
From source file:com.hadoop.mapreduce.LzoLineRecordReader.java
License:Open Source License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    Configuration job = context.getConfiguration();

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("No codec found for file " + file + ", cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    in = new LineReader(codec.createInputStream(fileIn), job);

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    this.pos = start;
}
From source file:com.hadoop.mapreduce.LzoTextInputFormat.java
License:Open Source License
/**
 * Read the index of the lzo file.
 *
 * @param file Read the index of this file.
 * @param fs   The index file is on this file system.
 * @throws IOException
 */
private LzoIndex readIndex(Path file, FileSystem fs) throws IOException {
    FSDataInputStream indexIn = null;
    try {
        Path indexFile = new Path(file.toString() + LZO_INDEX_SUFFIX);
        if (!fs.exists(indexFile)) {
            // return empty index, fall back to the unsplittable mode
            return new LzoIndex();
        }

        long indexLen = fs.getFileStatus(indexFile).getLen();
        int blocks = (int) (indexLen / 8);
        LzoIndex index = new LzoIndex(blocks);
        indexIn = fs.open(indexFile);
        for (int i = 0; i < blocks; i++) {
            index.set(i, indexIn.readLong());
        }
        return index;
    } finally {
        if (indexIn != null) {
            indexIn.close();
        }
    }
}
From source file:com.hadoop.mapreduce.LzoTextInputFormat.java
License:Open Source License
/**
 * Index an lzo file to allow the input format to split it into separate map
 * jobs.
 *
 * @param fs      File system that contains the file.
 * @param lzoFile the lzo file to index.
 * @throws IOException
 */
public static void createIndex(FileSystem fs, Path lzoFile) throws IOException {
    Configuration conf = fs.getConf();
    CompressionCodecFactory factory = new CompressionCodecFactory(fs.getConf());
    CompressionCodec codec = factory.getCodec(lzoFile);
    ((Configurable) codec).setConf(conf);

    InputStream lzoIs = null;
    FSDataOutputStream os = null;
    Path outputFile = new Path(lzoFile.toString() + LzoTextInputFormat.LZO_INDEX_SUFFIX);
    Path tmpOutputFile = outputFile.suffix(".tmp");

    try {
        FSDataInputStream is = fs.open(lzoFile);
        os = fs.create(tmpOutputFile);
        LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
        // for reading the header
        lzoIs = codec.createInputStream(is, decompressor);

        int numChecksums = decompressor.getChecksumsCount();

        while (true) {
            // read and ignore, we just want to get to the next int
            int uncompressedBlockSize = is.readInt();
            if (uncompressedBlockSize == 0) {
                break;
            } else if (uncompressedBlockSize < 0) {
                throw new EOFException();
            }

            int compressedBlockSize = is.readInt();
            if (compressedBlockSize <= 0) {
                throw new IOException("Could not read compressed block size");
            }

            long pos = is.getPos();
            // write the pos of the block start
            os.writeLong(pos - 8);
            // seek to the start of the next block, skip any checksums
            is.seek(pos + compressedBlockSize + (4 * numChecksums));
        }
    } finally {
        if (lzoIs != null) {
            lzoIs.close();
        }

        if (os != null) {
            os.close();
        }
    }

    fs.rename(tmpOutputFile, outputFile);
}
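For context, a hedged sketch of wiring LzoTextInputFormat into a new-API MapReduce job; the job name and input path are placeholders:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// Hypothetical job setup: with the index files in place, the LZO-aware
// input format can split .lzo inputs across map tasks.
public static Job configureLzoJob(Configuration conf) throws IOException {
    Job job = Job.getInstance(conf, "lzo-example"); // placeholder job name
    job.setInputFormatClass(LzoTextInputFormat.class);
    FileInputFormat.addInputPath(job, new Path("/data/input")); // placeholder path
    return job;
}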
From source file:com.hdfs.concat.crush.CountersInputFormat.java
License:Apache License
@Override
public RecordReader<Counters, NullWritable> getRecordReader(InputSplit inputSplit, JobConf jobconf,
        Reporter reporter) throws IOException {
    if (!(inputSplit instanceof FileSplit)) {
        throw new AssertionError();
    }

    FileSplit fSplit = (FileSplit) inputSplit;

    Path path = fSplit.getPath();
    long length = fSplit.getLength();

    FileSystem fs = FileSystem.get(jobconf);
    FSDataInputStream is = fs.open(path);

    return new CountersReader(is, length);
}
From source file:com.hdfstoftp.main.HdfsToFtp.java
/**
 * Copies files from HDFS to an FTP server.
 *
 * @param config transfer configuration: source and destination directories, filename and
 *               time-range filters, thread count, retry count, and FTP connection settings
 * @return true when the transfer completes
 * @throws Exception
 */
private static boolean copyFromHDFSToFTP(Config config) throws Exception {
    // Connect to HDFS.
    Configuration conf = new Configuration();
    FileSystem srcFS = FileSystem.get(conf);
    long start = System.currentTimeMillis();
    boolean isRename = config.isRenameUploaded();
    int retryTimes = config.getRetryTimes();
    // Destination directory on the FTP server.
    String dstPath = config.getDestDir();
    Path src = new Path(config.getSouceDir());
    FileStatus fileStatus = srcFS.getFileStatus(src);
    String subDir = null;
    if (fileStatus.isDirectory()) { // the source is a directory
        if (isRename) { // create the subdirectory that uploaded files are renamed into
            subDir = Config.RENAME_DIR;
            srcFS.mkdirs(new Path(fileStatus.getPath(), subDir));
        }
        int threadNum = config.getThreadNum();
        // Thread pool for concurrent uploads.
        ExecutorService threadPool = Executors.newFixedThreadPool(threadNum);
        // Pool of FTP client connections.
        FTPClientPool ftpPool = new FTPClientPool(threadNum, new FtpClientFactory(config.getFTPClientConfig()));
        FTPClient ftpClient = ftpPool.borrowObject();
        // Make sure the destination directory exists.
        ftpClient.makeDirectory(dstPath);
        ftpPool.returnObject(ftpClient);
        // List the source files.
        FileStatus contents[] = srcFS.listStatus(src);
        long beginFilter = 0;
        long endFilter = 0;
        // Filter by time range if any of the -d/-h/-t options were given.
        if (config.getCommandLine().hasOption("d") || config.getCommandLine().hasOption("h")
                || config.getCommandLine().hasOption("t")) {
            beginFilter = System.currentTimeMillis();
            Long[] timeRange = parseTimeRange(config.getCommandLine());
            contents = getNewContents(timeRange, contents);
            endFilter = System.currentTimeMillis();
        }
        // Filter by filename pattern.
        if (config.getCommandLine().hasOption("r")) {
            beginFilter = System.currentTimeMillis();
            contents = getFilterContents(config.getCommandLine().getOptionValue("r").trim(), contents);
            endFilter = System.currentTimeMillis();
        }
        logger.info("total file count:" + contents.length);
        Map<String, String> fileNameMap = null;
        long beginSkip = 0;
        long endSkip = 0;
        boolean overwrite = true;
        if (config.getCommandLine().hasOption("o")) {
            overwrite = "true".equals(config.getCommandLine().getOptionValue("o").trim());
        }
        if (!overwrite) { // collect the names already on the server so they can be skipped
            beginSkip = System.currentTimeMillis();
            fileNameMap = getFileNameMap(dstPath, ftpPool);
            endSkip = System.currentTimeMillis();
        }
        int skipped = 0;

        List<Future<?>> futureList = new ArrayList<Future<?>>();
        for (int i = 0; i < contents.length; i++) {
            if (!overwrite && fileNameMap.containsKey(contents[i].getPath().getName())) {
                // Already uploaded; skip it.
                skipped++;
                Log.info("skipped filename:" + contents[i].getPath().getName());
                continue;
            }
            if (contents[i].isDirectory()) {
                continue;
            }
            // Submit one upload task per file.
            Future<?> future = threadPool.submit(new UploadFileTask(srcFS, contents[i].getPath(),
                    new Path(dstPath, contents[i].getPath().getName()), ftpPool, false, isRename, subDir,
                    retryTimes));
            futureList.add(future);
        }
        int transferred = 0;
        int failed = 0;
        for (Future<?> future : futureList) {
            Boolean computeResult = (Boolean) future.get();
            if (computeResult) {
                transferred++;
                if (transferred % 50 == 0 || transferred == contents.length) {
                    logger.info("have transferred:" + transferred + " files");
                }
            } else {
                failed++;
                logger.error("failed transfer:" + failed + " files");
            }
        }
        // Shut down the thread pool.
        threadPool.shutdown();
        // Close the pooled FTP clients.
        ftpPool.close();
        // **************** summary ****************
        logger.info("filter time:" + (endFilter - beginFilter) + " ms");
        if (!overwrite) {
            logger.info("skip time:" + (endSkip - beginSkip) + " ms");
        }
        logger.info("total file count:" + contents.length);
        logger.info("total transferred: " + transferred + ",total failed:" + failed + ",total skipped:" + skipped);
    } else { // the source is a single file listing the paths to transfer
        BufferedReader reader = null;
        FtpClientFactory factory = new FtpClientFactory(config.getFTPClientConfig());
        FTPClient ftpClient = null;
        InputStream in = null;
        try {
            Path path = fileStatus.getPath();
            if (!path.getName().contains("log")) {
                // no-op
            }
            reader = new BufferedReader(new FileReader(new File(path.toUri().getPath())));
            String str = null;
            ftpClient = factory.makeObject();
            while ((str = reader.readLine()) != null) {
                String[] fields = str.split("&");
                Path filePath = null;
                if (fields.length == 2 && !fields[1].isEmpty()) {
                    filePath = new Path(fields[1]);
                    in = srcFS.open(filePath);
                    boolean result = ftpClient.storeFile(dstPath, in);
                    System.out.println(ftpClient.getReplyCode());
                    if (result) {
                        logger.info(filePath.toString());
                    } else {
                        logger_failed.info(filePath.toString());
                    }
                } else {
                    continue;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (in != null) {
                in.close();
            }
            if (reader != null) {
                reader.close();
            }
            factory.destroyObject(ftpClient);
        }
    }
    long end = System.currentTimeMillis();
    logger.info("finished transfer,total time:" + (end - start) / 1000 + "s");
    return true;
}
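At its core, the example above streams an open HDFS file into FTPClient.storeFile. A minimal sketch of just that step using Apache Commons Net (hedged: the helper name uploadOne, the host, and the credentials are placeholders):

import java.io.IOException;
import org.apache.commons.net.ftp.FTP;
import org.apache.commons.net.ftp.FTPClient;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical single-file transfer: stream an HDFS file to an FTP server
// without buffering it locally.
public static boolean uploadOne(Path src, String remoteName) throws IOException {
    FileSystem fs = src.getFileSystem(new Configuration());
    FTPClient ftp = new FTPClient();
    ftp.connect("ftp.example.com");        // placeholder host; check reply codes in real code
    ftp.login("user", "password");         // placeholder credentials
    ftp.setFileType(FTP.BINARY_FILE_TYPE); // binary mode, so contents are not mangled
    try (FSDataInputStream in = fs.open(src)) {
        return ftp.storeFile(remoteName, in);
    } finally {
        ftp.logout();
        ftp.disconnect();
    }
}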