List of usage examples for org.apache.hadoop.fs FileSystem open
public FSDataInputStream open(Path f) throws IOException
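Before the per-project examples below, here is a minimal, self-contained sketch of the typical call pattern: obtain a FileSystem for a URI, call open() to get a seekable FSDataInputStream, and stream the bytes out. The URI used here is a placeholder, not taken from any of the source files.

import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class OpenExample {
    public static void main(String[] args) throws IOException {
        String uri = "hdfs:///tmp/example.txt"; // placeholder path, substitute your own
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        FSDataInputStream in = null;
        try {
            // open() returns a seekable, positioned-read-capable stream
            in = fs.open(new Path(uri));
            IOUtils.copyBytes(in, System.out, 4096, false);
        } finally {
            IOUtils.closeStream(in);
        }
    }
}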
From source file:cn.lhfei.hadoop.ch04.FileDecompressor.java
License:Apache License
/**
 * Use case: % hadoop FileDecompressor file.gz
 * @param args
 */
public static void main(String[] args) {
    FileSystem fs = null;
    String uri = args[0];
    Path inputPath = null;
    Configuration conf = new Configuration();
    CompressionCodecFactory factory = null;
    InputStream in = null;
    OutputStream out = null;
    try {
        fs = FileSystem.get(URI.create(uri), conf);
        inputPath = new Path(uri);
        factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(inputPath);
        if (codec == null) {
            System.err.println("No codec found for " + uri);
            System.exit(1);
        }
        String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
        in = codec.createInputStream(fs.open(inputPath));
        out = fs.create(new Path(outputUri));
        IOUtils.copyBytes(in, out, conf);
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }
}
From source file:cn.uc.hadoop.mapreduce.lib.input.FileNameLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    // ADD by qiujw: the key is the file name
    key = new Text(file.getName());
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:cn.uc.hadoop.mapreduce.lib.input.FilePathLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    // ADD by qiujw: the key is the full file path
    key = new Text(file.toString());
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
From source file:co.cask.cdap.common.io.Locations.java
License:Apache License
/**
 * Creates a new {@link InputSupplier} that provides a {@link SeekableInputStream} for the given path.
 *
 * @param fs The {@link org.apache.hadoop.fs.FileSystem} for the given path.
 * @param path The path to create a {@link co.cask.cdap.common.io.SeekableInputStream} for when requested.
 * @return An {@link InputSupplier}.
 */
public static InputSupplier<? extends SeekableInputStream> newInputSupplier(final FileSystem fs,
        final Path path) {
    return new InputSupplier<SeekableInputStream>() {
        @Override
        public SeekableInputStream getInput() throws IOException {
            FSDataInputStream input = fs.open(path);
            try {
                return new DFSSeekableInputStream(input, createDFSStreamSizeProvider(fs, path, input));
            } catch (Throwable t) {
                Closeables.closeQuietly(input);
                Throwables.propagateIfInstanceOf(t, IOException.class);
                throw new IOException(t);
            }
        }
    };
}
From source file:co.cask.cdap.common.logging.SyncTest.java
License:Apache License
@Test
@Ignore
public void testSync() throws IOException {
    FileSystem fs = FileSystem.get(config);
    // create a file, write n bytes, then sync
    Path path = new Path("/myfile");
    FSDataOutputStream out = fs.create(path, false, 4096, (short) 2, 4096L);
    int numBytes = 5000;
    for (int i = 0; i < numBytes; i++) {
        out.write((byte) i);
    }
    out.hflush();
    // verify the file is there
    Assert.assertTrue(fs.exists(path));
    // do not verify the length of the file, hflush() does not update that
    //Assert.assertEquals(numBytes, fs.getFileStatus(path).getLen());
    // read back and verify all bytes
    FSDataInputStream in = fs.open(path);
    byte[] buffer = new byte[numBytes];
    in.readFully(buffer);
    for (int i = 0; i < numBytes; i++) {
        Assert.assertEquals((byte) i, buffer[i]);
    }
    in.close();
    // now close the writer
    out.close();
}
From source file:co.cask.hydrator.plugin.batch.action.FileAction.java
License:Apache License
@SuppressWarnings("ConstantConditions") @Override/* w ww .ja v a2s .co m*/ public void run(BatchActionContext context) throws Exception { if (!config.shouldRun(context)) { return; } config.substituteMacros(context); Job job = JobUtils.createInstance(); Configuration conf = job.getConfiguration(); FileSystem fileSystem = FileSystem.get(conf); Path[] paths; Path sourcePath = new Path(config.path); if (fileSystem.isDirectory(sourcePath)) { FileStatus[] status = fileSystem.listStatus(sourcePath); paths = FileUtil.stat2Paths(status); } else { paths = new Path[] { sourcePath }; } //get regex pattern for file name filtering. boolean patternSpecified = !Strings.isNullOrEmpty(config.pattern); if (patternSpecified) { regex = Pattern.compile(config.pattern); } switch (config.action.toLowerCase()) { case "delete": for (Path path : paths) { if (!patternSpecified || isFileNameMatch(path.getName())) { fileSystem.delete(path, true); } } break; case "move": for (Path path : paths) { if (!patternSpecified || isFileNameMatch(path.getName())) { Path targetFileMovePath = new Path(config.targetFolder, path.getName()); fileSystem.rename(path, targetFileMovePath); } } break; case "archive": for (Path path : paths) { if (!patternSpecified || isFileNameMatch(path.getName())) { try (FSDataOutputStream archivedStream = fileSystem .create(new Path(config.targetFolder, path.getName() + ".zip")); ZipOutputStream zipArchivedStream = new ZipOutputStream(archivedStream); FSDataInputStream fdDataInputStream = fileSystem.open(path)) { zipArchivedStream.putNextEntry(new ZipEntry(path.getName())); int length; byte[] buffer = new byte[1024]; while ((length = fdDataInputStream.read(buffer)) > 0) { zipArchivedStream.write(buffer, 0, length); } zipArchivedStream.closeEntry(); } fileSystem.delete(path, true); } } break; default: LOG.warn("No action required on the file."); break; } }
From source file:co.cask.hydrator.plugin.batch.CopybookRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    // Get configuration
    Configuration conf = context.getConfiguration();
    int fileStructure = net.sf.JRecord.Common.Constants.IO_FIXED_LENGTH;
    Path path = new Path(conf.get(CopybookInputFormat.COPYBOOK_INPUTFORMAT_DATA_HDFS_PATH));
    FileSystem fs = FileSystem.get(path.toUri(), conf);
    // Create an input stream for the COBOL copybook contents
    InputStream inputStream = IOUtils
            .toInputStream(conf.get(CopybookInputFormat.COPYBOOK_INPUTFORMAT_CBL_CONTENTS), "UTF-8");
    BufferedInputStream bufferedInputStream = new BufferedInputStream(inputStream);
    try {
        externalRecord = CopybookIOUtils.getExternalRecord(bufferedInputStream);
        recordByteLength = CopybookIOUtils.getRecordLength(externalRecord, fileStructure);
        LineProvider lineProvider = LineIOProvider.getInstance().getLineProvider(fileStructure,
                CopybookIOUtils.FONT);
        reader = LineIOProvider.getInstance().getLineReader(fileStructure, lineProvider);
        LayoutDetail copybook = CopybookIOUtils.getLayoutDetail(externalRecord);
        org.apache.hadoop.mapreduce.lib.input.FileSplit fileSplit =
                (org.apache.hadoop.mapreduce.lib.input.FileSplit) split;
        start = fileSplit.getStart();
        end = start + fileSplit.getLength();
        BufferedInputStream fileIn = new BufferedInputStream(fs.open(fileSplit.getPath()));
        // Jump to the point in the split at which the first complete record starts,
        // if this is not the first InputSplit
        if (start != 0) {
            position = start - (start % recordByteLength) + recordByteLength;
            fileIn.skip(position);
        }
        reader.open(fileIn, copybook);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:co.cask.hydrator.plugin.db.batch.action.VerticaBulkImportAction.java
License:Apache License
@Override
public void run(ActionContext context) throws Exception {
    Object driver = Class.forName("com.vertica.jdbc.Driver").newInstance();
    DriverManager.registerDriver((Driver) driver);
    Preconditions.checkArgument(tableExists(config.tableName),
            "Table %s does not exist. Please check that the 'tableName' property "
                    + "has been set correctly, and that the connection string %s points to a valid database.",
            config.tableName, config.connectionString);
    String copyStatement;
    if (config.level.equalsIgnoreCase("basic")) {
        // COPY tableName FROM STDIN DELIMITER 'delimiter'
        copyStatement = String.format("COPY %s FROM STDIN DELIMITER '%s'", config.tableName, config.delimiter);
    } else {
        copyStatement = config.copyStatement;
    }
    LOG.debug("Copy statement is: {}", copyStatement);
    try {
        try (Connection connection = DriverManager.getConnection(config.connectionString, config.user,
                config.password)) {
            connection.setAutoCommit(false);
            // run the COPY statement
            VerticaCopyStream stream = new VerticaCopyStream((VerticaConnection) connection, copyStatement);
            // Keep a running count of the number of rejects
            int totalRejects = 0;
            // start() starts the stream process and opens the COPY command.
            stream.start();
            FileSystem fs = FileSystem.get(new Configuration());
            List<String> fileList = new ArrayList<>();
            FileStatus[] fileStatus;
            try {
                fileStatus = fs.listStatus(new Path(config.path));
                for (FileStatus fileStat : fileStatus) {
                    fileList.add(fileStat.getPath().toString());
                }
            } catch (FileNotFoundException e) {
                throw new IllegalArgumentException(String.format(
                        "Path %s not found on file system. Please provide correct path.", config.path), e);
            }
            if (fileStatus.length <= 0) {
                LOG.warn("No files available to load into vertica database");
            }
            for (String file : fileList) {
                Path path = new Path(file);
                FSDataInputStream inputStream = fs.open(path);
                // Add the stream to the VerticaCopyStream
                stream.addStream(inputStream);
                // Call execute() to load the newly added stream. You could add many
                // streams and call execute once to load them all. Which method you
                // choose depends mainly on whether you want the ability to check the
                // number of rejections as the load progresses so you can stop if the
                // number of rejects gets too high. Also, high numbers of InputStreams
                // could create a resource issue on your client system.
                stream.execute();
                // Show any rejects from this execution of the stream load.
                // getRejects() returns a List containing the row numbers of rejected rows.
                List<Long> rejects = stream.getRejects();
                // The size of the list gives you the number of rejected rows.
                int numRejects = rejects.size();
                totalRejects += numRejects;
                if (config.autoCommit.equalsIgnoreCase("true")) {
                    // Commit the loaded data
                    connection.commit();
                }
            }
            // finish() closes the COPY command. It returns the number of rows inserted.
            long results = stream.finish();
            context.getMetrics().gauge("num.of.rows.rejected", totalRejects);
            context.getMetrics().gauge("num.of.rows.inserted", results);
            // Commit the loaded data
            connection.commit();
        }
    } catch (Exception e) {
        throw new RuntimeException(String.format("Exception while running copy statement %s", copyStatement), e);
    } finally {
        DriverManager.deregisterDriver((Driver) driver);
    }
}
From source file:co.cask.tigon.io.Locations.java
License:Apache License
/**
 * Creates a new {@link com.google.common.io.InputSupplier} that provides a {@link SeekableInputStream} of
 * the given path.
 *
 * @param fs The {@link org.apache.hadoop.fs.FileSystem} for the given path.
 * @param path The path to create a {@link co.cask.tigon.io.SeekableInputStream} for when requested.
 * @return An {@link com.google.common.io.InputSupplier}.
 */
public static InputSupplier<? extends SeekableInputStream> newInputSupplier(final FileSystem fs,
        final Path path) {
    return new InputSupplier<SeekableInputStream>() {
        @Override
        public SeekableInputStream getInput() throws IOException {
            return SeekableInputStream.create(fs.open(path));
        }
    };
}
From source file:co.nubetech.hiho.dedup.DelimitedLineRecordReader.java
License:Apache License
/**
 * @param delimiter
 * @param column
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.delimiter = job.get(DelimitedTextInputFormat.DELIMITER_CONF);
    this.column = job.getInt(DelimitedTextInputFormat.COLUMN_CONF, 0);
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip the first line and re-establish "start"
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}