Example usage for org.apache.hadoop.fs Path suffix

List of usage examples for org.apache.hadoop.fs Path suffix

Introduction

On this page you can find usage examples for org.apache.hadoop.fs.Path.suffix(String).

Prototype

public Path suffix(String suffix) 

Document

Adds a suffix to the final name in the path.
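
A minimal sketch (with hypothetical paths, not taken from the examples below) of what suffix(...) returns:

import org.apache.hadoop.fs.Path;

Path lzoFile = new Path("/data/logs/part-00000.lzo");
Path indexFile = lzoFile.suffix(".index");
// indexFile -> /data/logs/part-00000.lzo.index
// The suffix is appended to the final name component of the path. Passing a
// string that starts with "/" (e.g. suffix("/" + name)) effectively yields a
// child path, as the HBaseTable example below does.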

Usage

From source file: com.aliyun.fs.oss.TestAliyunOSSFileSystemStore.java

License: Apache License

protected void writeRenameReadCompare(Path path, long len) throws IOException, NoSuchAlgorithmException {
    // If len > fs.oss.multipart.upload.threshold,
    // we'll use a multipart upload copy
    MessageDigest digest = MessageDigest.getInstance("MD5");
    OutputStream out = new BufferedOutputStream(new DigestOutputStream(fs.create(path, false), digest));
    for (long i = 0; i < len; i++) {
        out.write('Q');
    }
    out.flush();
    out.close();

    assertTrue("Exists", fs.exists(path));

    Path copyPath = path.suffix(".copy");
    fs.rename(path, copyPath);

    assertTrue("Copy exists", fs.exists(copyPath));

    // Download file from Aliyun OSS and compare the digest against the original
    MessageDigest digest2 = MessageDigest.getInstance("MD5");
    InputStream in = new BufferedInputStream(new DigestInputStream(fs.open(copyPath), digest2));
    long copyLen = 0;
    while (in.read() != -1) {
        copyLen++;
    }
    in.close();

    assertEquals("Copy length matches original", len, copyLen);
    assertArrayEquals("Digests match", digest.digest(), digest2.digest());
}

From source file: com.btoddb.chronicle.apps.AvroTools.java

License: Open Source License

private void testFileAndFix(Path inFile) throws IOException {
    FileContext context = FileContext.getFileContext(hdfsConfig);
    AvroFSInput input = new AvroFSInput(context, inFile);

    ReflectDatumReader<Object> reader = new ReflectDatumReader<>();
    FileReader<Object> fileReader = DataFileReader.openReader(input, reader);

    Path outFile = inFile.suffix(".fixing");
    FSDataOutputStream output = FileSystem.create(outFile.getFileSystem(hdfsConfig), outFile,
            FsPermission.getDefault());
    DataFileWriter<Object> writer = new DataFileWriter<>(new GenericDatumWriter<>());
    writer.setCodec(CodecFactory.snappyCodec());

    boolean corrupted = false;
    long count = 0;

    try {
        Schema schema = fileReader.getSchema();
        writer.create(schema, output);

        for (;;) {
            try {
                if (fileReader.hasNext()) {
                    Object obj = fileReader.next();
                    count++;
                    writer.append(obj);
                } else {
                    break;
                }
            } catch (AvroRuntimeException e) {
                corrupted = true;
                System.out.println("  - file pointer = " + input.tell());
                if (e.getCause() instanceof EOFException) {
                    System.out.println("  - EOF occurred so we're done : " + e.getMessage());
                    break;
                } else if (e.getCause() instanceof IOException) {
                    System.out.println("  - will try to 'next' past the error : " + e.getMessage());
                    try {
                        fileReader.next();
                        System.out.println("  - 'next' worked - didn't really expect it to, but great!");
                    } catch (Exception e2) {
                        System.out.println("  - 'next' did not work - will continue on and see what happens : "
                                + e2.getMessage());
                    }
                    continue;
                }
                break;
            } catch (Exception e) {
                corrupted = true;
                System.out.println("  - file pointer = " + input.tell());
                e.printStackTrace();
                break;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        System.out.println(("  - processed " + count + " records"));
        if (null != fileReader) {
            try {
                fileReader.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        if (null != writer) {
            try {
                writer.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    if (!corrupted) {
        outFile.getFileSystem(hdfsConfig).delete(outFile, false);
    } else {
        outFile.getFileSystem(hdfsConfig).rename(outFile, inFile.suffix(".fixed"));
    }
}

From source file: com.cloudera.impala.catalog.HBaseTable.java

License: Apache License

/**
 * Returns the Hdfs size of the given region in bytes. NULL can be
 * passed as a parameter to retrieve the size of the complete table.
 */
public long getHdfsSize(HRegionInfo info) throws IOException {
    Path tableDir = HTableDescriptor.getTableDir(FSUtils.getRootDir(hbaseConf_),
            Bytes.toBytes(hbaseTableName_));
    FileSystem fs = tableDir.getFileSystem(hbaseConf_);
    if (info != null) {
        Path regionDir = tableDir.suffix("/" + info.getEncodedName());
        return fs.getContentSummary(regionDir).getLength();
    } else {
        return fs.getContentSummary(tableDir).getLength();
    }
}

From source file: com.fullcontact.sstable.index.SSTableIndexIndex.java

License: Apache License

/**
 * Read an existing index. Reads and returns the index index, which is a list of chunks defined by the Cassandra
 * Index.db file along with the configured split size.
 *
 * @param fileSystem Hadoop file system.
 * @param sstablePath SSTable Index.db.
 * @return Index of chunks.
 * @throws IOException
 */
public static SSTableIndexIndex readIndex(final FileSystem fileSystem, final Path sstablePath)
        throws IOException {
    final Closer closer = Closer.create();
    final Path indexPath = sstablePath.suffix(SSTABLE_INDEX_SUFFIX);

    // Detonate if we don't have an index.
    final FSDataInputStream inputStream = closer.register(fileSystem.open(indexPath));

    final SSTableIndexIndex indexIndex = new SSTableIndexIndex();
    try {
        while (inputStream.available() != 0) {
            indexIndex.add(inputStream.readLong(), inputStream.readLong());
        }
    } finally {
        closer.close();
    }

    return indexIndex;
}

From source file: com.fullcontact.sstable.index.SSTableIndexIndex.java

License: Apache License

/**
 * Create and write an index index based on the input Cassandra Index.db file. Read the Index.db and generate chunks
 * (splits) based on the configured chunk size.
 *
 * @param fileSystem Hadoop file system.
 * @param sstablePath SSTable Index.db.
 * @throws IOException
 */
public static void writeIndex(final FileSystem fileSystem, final Path sstablePath) throws IOException {

    final Configuration configuration = fileSystem.getConf();

    final long splitSize = configuration.getLong(HadoopSSTableConstants.HADOOP_SSTABLE_SPLIT_MB,
            HadoopSSTableConstants.DEFAULT_SPLIT_MB) * 1024 * 1024;

    final Closer closer = Closer.create();

    final Path outputPath = sstablePath.suffix(SSTABLE_INDEX_SUFFIX);
    final Path inProgressOutputPath = sstablePath.suffix(SSTABLE_INDEX_IN_PROGRESS_SUFFIX);

    boolean success = false;
    try {
        final FSDataOutputStream os = closer.register(fileSystem.create(inProgressOutputPath));

        final TLongArrayList splitOffsets = new TLongArrayList();
        long currentStart = 0;
        long currentEnd = 0;
        final IndexOffsetScanner index = new IndexOffsetScanner(sstablePath, fileSystem);

        while (index.hasNext()) {
            // NOTE: This does not give an exact size of this split in bytes but a rough estimate.
            // This should be good enough since it's only used for sorting splits by size in hadoop land.
            while (currentEnd - currentStart < splitSize && index.hasNext()) {
                currentEnd = index.next();
                splitOffsets.add(currentEnd);
            }

            // Record the split
            final long[] offsets = splitOffsets.toArray();
            os.writeLong(offsets[0]); // Start
            os.writeLong(offsets[offsets.length - 1]); // End

            // Clear the offsets
            splitOffsets.clear();

            if (index.hasNext()) {
                currentStart = index.next();
                currentEnd = currentStart;
                splitOffsets.add(currentStart);
            }
        }

        success = true;
    } finally {
        closer.close();

        if (!success) {
            fileSystem.delete(inProgressOutputPath, false);
        } else {
            fileSystem.rename(inProgressOutputPath, outputPath);
        }
    }
}

From source file: com.hadoop.compression.lzo.LzoIndex.java

License: Open Source License

/**
 * Read the index of the lzo file.
 *
 * @param fs The index file is on this file system.
 * @param lzoFile the file whose index we are reading -- NOT the index file itself.  That is,
 * pass in filename.lzo, not filename.lzo.index, for this parameter.
 * @throws IOException
 */
public static LzoIndex readIndex(FileSystem fs, Path lzoFile) throws IOException {
    FSDataInputStream indexIn = null;
    Path indexFile = lzoFile.suffix(LZO_INDEX_SUFFIX);

    try {
        indexIn = fs.open(indexFile);
    } catch (IOException fileNotFound) {
        // return empty index, fall back to the unsplittable mode
        return new LzoIndex();
    }

    int capacity = 16 * 1024 * 8; //size for a 4GB file (with 256KB lzo blocks)
    DataOutputBuffer bytes = new DataOutputBuffer(capacity);

    // copy indexIn and close it
    IOUtils.copyBytes(indexIn, bytes, 4 * 1024, true);

    ByteBuffer bytesIn = ByteBuffer.wrap(bytes.getData(), 0, bytes.getLength());
    int blocks = bytesIn.remaining() / 8;
    LzoIndex index = new LzoIndex(blocks);

    for (int i = 0; i < blocks; i++) {
        index.set(i, bytesIn.getLong());
    }

    return index;
}

From source file: com.hadoop.compression.lzo.LzoIndex.java

License: Open Source License

/**
 * Index an lzo file to allow the input format to split them into separate map
 * jobs.
 *
 * @param fs File system that contains the file.
 * @param lzoFile the lzo file to index.  For filename.lzo, the created index file will be
 * filename.lzo.index.
 * @throws IOException
 */
public static void createIndex(FileSystem fs, Path lzoFile) throws IOException {

    Configuration conf = fs.getConf();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(lzoFile);
    if (null == codec) {
        throw new IOException("Could not find codec for file " + lzoFile
                + " - you may need to add the LZO codec to your io.compression.codecs "
                + "configuration in core-site.xml");
    }
    ((Configurable) codec).setConf(conf);

    FSDataInputStream is = null;
    FSDataOutputStream os = null;
    Path outputFile = lzoFile.suffix(LZO_INDEX_SUFFIX);
    Path tmpOutputFile = lzoFile.suffix(LZO_TMP_INDEX_SUFFIX);

    // Track whether an exception was thrown or not, so we know to either
    // delete the tmp index file on failure, or rename it to the new index file on success.
    boolean indexingSucceeded = false;
    try {
        is = fs.open(lzoFile);
        os = fs.create(tmpOutputFile);
        LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
        // Solely for reading the header
        codec.createInputStream(is, decompressor);
        int numCompressedChecksums = decompressor.getCompressedChecksumsCount();
        int numDecompressedChecksums = decompressor.getDecompressedChecksumsCount();

        while (true) {
            // read and ignore, we just want to get to the next int
            int uncompressedBlockSize = is.readInt();
            if (uncompressedBlockSize == 0) {
                break;
            } else if (uncompressedBlockSize < 0) {
                throw new EOFException();
            }

            int compressedBlockSize = is.readInt();
            if (compressedBlockSize <= 0) {
                throw new IOException("Could not read compressed block size");
            }

            // See LzopInputStream.getCompressedData
            boolean isUncompressedBlock = (uncompressedBlockSize == compressedBlockSize);
            int numChecksumsToSkip = isUncompressedBlock ? numDecompressedChecksums
                    : numDecompressedChecksums + numCompressedChecksums;
            long pos = is.getPos();
            // write the pos of the block start
            os.writeLong(pos - 8);
            // seek to the start of the next block, skip any checksums
            is.seek(pos + compressedBlockSize + (4 * numChecksumsToSkip));
        }
        // If we're here, indexing was successful.
        indexingSucceeded = true;
    } finally {
        // Close any open streams.
        if (is != null) {
            is.close();
        }

        if (os != null) {
            os.close();
        }

        if (!indexingSucceeded) {
            // If indexing didn't succeed (i.e. an exception was thrown), clean up after ourselves.
            fs.delete(tmpOutputFile, false);
        } else {
            // Otherwise, rename filename.lzo.index.tmp to filename.lzo.index.
            fs.rename(tmpOutputFile, outputFile);
        }
    }
}

From source file: com.hadoop.mapreduce.LzoTextInputFormat.java

License: Open Source License

/**
 * Index an lzo file to allow the input format to split them into separate map
 * jobs.
 * 
 * @param fs
 *          File system that contains the file.
 * @param lzoFile
 *          the lzo file to index.
 * @throws IOException
 */
public static void createIndex(FileSystem fs, Path lzoFile) throws IOException {

    Configuration conf = fs.getConf();
    CompressionCodecFactory factory = new CompressionCodecFactory(fs.getConf());
    CompressionCodec codec = factory.getCodec(lzoFile);
    ((Configurable) codec).setConf(conf);

    InputStream lzoIs = null;
    FSDataOutputStream os = null;
    Path outputFile = new Path(lzoFile.toString() + LzoTextInputFormat.LZO_INDEX_SUFFIX);
    Path tmpOutputFile = outputFile.suffix(".tmp");

    try {
        FSDataInputStream is = fs.open(lzoFile);
        os = fs.create(tmpOutputFile);
        LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
        // for reading the header
        lzoIs = codec.createInputStream(is, decompressor);

        int numChecksums = decompressor.getChecksumsCount();

        while (true) {
            // read and ignore, we just want to get to the next int
            int uncompressedBlockSize = is.readInt();
            if (uncompressedBlockSize == 0) {
                break;
            } else if (uncompressedBlockSize < 0) {
                throw new EOFException();
            }

            int compressedBlockSize = is.readInt();
            if (compressedBlockSize <= 0) {
                throw new IOException("Could not read compressed block size");
            }

            long pos = is.getPos();
            // write the pos of the block start
            os.writeLong(pos - 8);
            // seek to the start of the next block, skip any checksums
            is.seek(pos + compressedBlockSize + (4 * numChecksums));
        }
    } finally {
        if (lzoIs != null) {
            lzoIs.close();
        }

        if (os != null) {
            os.close();
        }
    }

    fs.rename(tmpOutputFile, outputFile);
}

From source file: com.liferay.hadoop.search.HadoopDLIndexerPostProcessor.java

License: Open Source License

public void postProcessDocument(Document document, Object obj) throws Exception {

    DLFileEntry dlFileEntry = (DLFileEntry) obj;

    long companyId = dlFileEntry.getCompanyId();
    long repositoryId = dlFileEntry.getRepositoryId();

    String stringObject = document.toString();

    // remove JSON chars

    stringObject = StringUtil.replace(stringObject, new String[] { "\"", ",", ":", "{", "}", "[", "]" },
            new String[] { StringPool.SPACE, StringPool.SPACE, StringPool.SPACE, StringPool.SPACE,
                    StringPool.SPACE, StringPool.SPACE, StringPool.SPACE });

    Path fullDirPath = HadoopManager.getFullDirPath(companyId, repositoryId, null);

    fullDirPath = new Path("/index".concat(fullDirPath.toString()));

    FSDataOutputStream outputStream = null;

    try {
        FileSystem fileSystem = HadoopManager.getFileSystem();

        String suffix = StringPool.SLASH.concat(document.getUID());

        outputStream = fileSystem.create(fullDirPath.suffix(suffix));

        PrintWriter pw = new PrintWriter(outputStream);

        pw.write(stringObject);

        pw.flush();
        pw.close();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        StreamUtil.cleanUp(outputStream);
    }
}

From source file: com.nagarro.nteg.utils.HDFSFileDataBufferedReader.java

License: Apache License

/**
 * @param path
 * @param batchSize
 * @throws IOException
 */
public HDFSFileDataBufferedReader(final Path path, final int batchSize) throws IOException {
    this.batchSize = batchSize;
    this.currentFile = path;
    this.buffer = new ArrayDeque<String>(batchSize);
    internalRenamedFile = path.suffix(IN_PROGRESS_FILE_SUFFIX);

    fileSystem = path.getFileSystem(new Configuration());

    fileSystem.rename(path, internalRenamedFile);

    scanner = new Scanner(fileSystem.open(internalRenamedFile));
}