Example usage for org.apache.hadoop.io.compress CompressionCodec createInputStream

Introduction

On this page you can find example usage for org.apache.hadoop.io.compress CompressionCodec createInputStream.

Prototype

CompressionInputStream createInputStream(InputStream in) throws IOException;

Document

Create a CompressionInputStream that will read from the given input stream.
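
Before the individual source files below, here is a minimal sketch of the typical pattern; the class name and path are hypothetical. Ask a CompressionCodecFactory for a codec that matches the file name and, only if one is found, wrap the raw stream with createInputStream.

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CreateInputStreamSketch {

    /** Opens a file, transparently decompressing it when a codec matches its extension. */
    public static InputStream openPossiblyCompressed(Configuration conf, Path path) throws IOException {
        FileSystem fs = path.getFileSystem(conf);
        InputStream raw = fs.open(path);
        // getCodec() picks a codec from the file extension (e.g. .gz, .bz2); null means "not compressed".
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        return codec == null ? raw : codec.createInputStream(raw);
    }
}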

Usage

From source file: com.yolodata.tbana.hadoop.mapred.csv.CSVLineRecordReader.java

License: Open Source License

public void initialize(InputSplit genericSplit, JobConf conf) throws IOException {
    FileSplit split = (FileSplit) genericSplit;

    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (codec != null) {
        is = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            fileIn.seek(start);
        }
        is = fileIn;
    }

    this.pos = start;
    init(is, conf);
}

From source file: cosmos.mapred.LongLineRecordReader.java

License: Apache License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LfLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LfLineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file: crunch.MaxTemperature.java

License: Apache License

public static void main(String[] args) throws Exception {
        String uri = args[0];
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);

        Path inputPath = new Path(uri);
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(inputPath);
        if (codec == null) {
            System.err.println("No codec found for " + uri);
            System.exit(1);
        }

        String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());

        InputStream in = null;
        OutputStream out = null;
        try {
            in = codec.createInputStream(fs.open(inputPath));
            out = fs.create(new Path(outputUri));
            IOUtils.copyBytes(in, out, conf);
        } finally {
            IOUtils.closeStream(in);
            IOUtils.closeStream(out);
        }
    }

From source file: de.rwhq.hdfs.index.LineRecordReader.java

License: Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file: de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.io.WARCFileReader.java

License: Apache License

/**
 * Opens a file for reading. If the filename ends in `.gz`, it is automatically decompressed
 * on the fly.
 *
 * @param conf     The Hadoop configuration.
 * @param filePath The Hadoop path to the file that should be read.
 * @throws IOException I/O exception
 */
public WARCFileReader(Configuration conf, Path filePath) throws IOException {
    FileSystem fs = filePath.getFileSystem(conf);
    this.fileSize = fs.getFileStatus(filePath).getLen();
    logger.info("Reading from " + filePath);

    CompressionCodec codec = filePath.getName().endsWith(".gz") ? WARCFileWriter.getGzipCodec(conf) : null;
    byteStream = new CountingInputStream(new BufferedInputStream(fs.open(filePath)));
    dataStream = new DataInputStream(codec == null ? byteStream : codec.createInputStream(byteStream));
}
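
A minimal construction sketch for the reader above; the path is hypothetical, and the record-reading methods of WARCFileReader are not part of this excerpt.

    Configuration conf = new Configuration();
    Path warcPath = new Path("hdfs:///corpus/example.warc.gz"); // hypothetical location
    // The .gz suffix makes the constructor wrap the stream in a gzip CompressionInputStream.
    WARCFileReader reader = new WARCFileReader(conf, warcPath);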

From source file: de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.standalone.WarcBoilerplateRemoval.java

License: Apache License

public static void processWarcGzFile(File input, File outFile, boolean keepMinimalHtml) throws IOException {
    System.out.printf("Reading from %s, writing to %s%n", input, outFile);

    Configuration conf = new Configuration();
    // set limit to 100 GB (= almost unlimited)
    conf.setLong("warc.output.segment.size", WARCFileWriter.DEFAULT_MAX_SEGMENT_SIZE * 100);

    //Opens a file for reading.
    CompressionCodec codec = WARCFileWriter.getGzipCodec(conf);
    InputStream byteStream = new BufferedInputStream(new FileInputStream(input));
    DataInputStream dataStream = new DataInputStream(
            codec == null ? byteStream : codec.createInputStream(byteStream));

    BoilerPlateRemoval boilerPlateRemoval = new JusTextBoilerplateRemoval();

    long startTime = System.currentTimeMillis();
    int counter = 0;

    int recordsRead = 0;

    Path outputPath = new Path(outFile.getAbsolutePath());
    WARCFileWriter warcFileWriter = new WARCFileWriter(conf, codec, outputPath);

    // detecting the correct charset
    final CharsetDetector charsetDetector = new ICUCharsetDetectorWrapper();

    while (true) {
        try {
            //Reads the next record from the file.
            WARCRecord wc = new WARCRecord(dataStream);

            // detect charset
            byte[] bytes = wc.getContent();
            Charset charset = charsetDetector.detectCharset(bytes);

            String html = new String(bytes, charset);

            // strip HTTP header
            html = html.substring(html.indexOf("\r\n\r\n") + 4);

            String plainText;
            if (keepMinimalHtml) {
                plainText = boilerPlateRemoval.getMinimalHtml(html, null);
            } else {
                plainText = boilerPlateRemoval.getPlainText(html, null);
            }

            counter++;
            if (counter % 100 == 0) {
                System.out.printf(Locale.ENGLISH, "~%.1f entries per second%n",
                        counter * 1000f / (double) (System.currentTimeMillis() - startTime));
                System.out.printf(Locale.ENGLISH, "%d records processed%n", recordsRead);
            }

            recordsRead++;

            // create copy of WarcRecord
            WARCRecord newWarcRecord = new WARCRecord(wc);
            newWarcRecord.setContent(plainText);

            warcFileWriter.write(newWarcRecord);
        } catch (EOFException e) {
            break;
        }
    }

    warcFileWriter.close();

    // rename from out.warc.gz.seg-00000.warc.gz to out.warc.gz
    File actualOutputFile = new File(outFile.getAbsolutePath() + ".seg-00000.warc.gz");
    if (!actualOutputFile.exists()) {
        throw new IOException("File " + actualOutputFile + " does not exist");
    }
    if (!actualOutputFile.renameTo(outFile)) {
        throw new IOException("Renaming file " + actualOutputFile + " to " + outFile + " failed");
    }

    // delete .crc file
    File crcFile = new File(actualOutputFile.getParentFile(), "." + actualOutputFile.getName() + ".crc");
    if (!crcFile.delete()) {
        throw new IOException(crcFile + " was not deleted");
    }

    System.out.printf(Locale.ENGLISH, "%d records written to %s, total time %f%n", recordsRead,
            outFile.getName(), counter * 1000f / (double) (System.currentTimeMillis() - startTime));
}

From source file: edu.cmu.cs.in.hadoop.HoopWholeFileRecordReader.java

License: Open Source License

public HoopWholeFileRecordReader(JobConf aJob, InputSplit aSplit) {
    setClassName("HoopWholeFileRecordReader");
    debug("HoopWholeFileRecordReader ()");

    job = aJob;

    FileSplit split = (FileSplit) aSplit;

    //this.maxLineLength=job.getInt ("mapred.linerecordreader.maxlength",Integer.MAX_VALUE);

    fileSize = split.getLength();

    final Path file = split.getPath();

    createKeyFromName(file.getName());

    debug("File/Key: " + internalKey + " with size: " + split.getLength());

    compressionCodecs = new CompressionCodecFactory(job);

    final CompressionCodec codec = compressionCodecs.getCodec(file);

    FileSystem fs = null;
    try {
        fs = file.getFileSystem(job);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    FSDataInputStream fileIn = null;

    try {
        fileIn = fs.open(split.getPath());
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    if (codec != null) {
        try {
            inStream = codec.createInputStream(fileIn);
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    } else {

        inStream = fileIn;
    }
}

From source file: edu.uci.ics.hyracks.imru.file.HDFSUtils.java

License: Apache License

/**
 * Open a file in HDFS for reading, performing automatic
 * decompression as necessary.
 *
 * @param dfs
 *            The HDFS file system object.
 * @param conf
 *            The HDFS configuration.
 * @param path
 *            The path to the file.
 * @return An InputStream for reading the file.
 * @throws IOException
 */
public static InputStream open(FileSystem dfs, Configuration conf, Path path) throws IOException {
    FSDataInputStream fin = dfs.open(path);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(path);
    if (codec != null) {
        return codec.createInputStream(fin);
    } else {
        return fin;
    }
}
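
A minimal calling sketch for the helper above, assuming the default FileSystem and a hypothetical path; the returned stream is already decompressed when a codec matched the file extension.

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    try (InputStream in = HDFSUtils.open(fs, conf, new Path("/data/input.txt.gz"));
         BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
        System.out.println(reader.readLine()); // first line of the (decompressed) content
    }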

From source file: format.OverlapRecordReader.java

License: BSD License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    //Configuration job = HadoopUtils.getConfiguration(context);
    Configuration job = context.getConfiguration();
    maxLineLen = job.getInt(MAX_LINE_LEN_CONF, Integer.MAX_VALUE);

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("Codec for file " + file + " not found, cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    in = new LineReader(codec.createInputStream(fileIn), job);

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    this.pos = start;
}

From source file: gobblin.source.extractor.hadoop.HadoopFsHelper.java

License: Apache License

/**
 * Returns an {@link InputStream} to the specified file.
 * <p>
 * Note: It is the caller's responsibility to close the returned {@link InputStream}.
 * </p>
 *
 * @param path The path to the file to open.
 * @return An {@link InputStream} for the specified file.
 * @throws FileBasedHelperException if there is a problem opening the {@link InputStream} for the specified file.
 */
@Override
public InputStream getFileStream(String path) throws FileBasedHelperException {
    try {
        Path p = new Path(path);
        InputStream in = this.getFileSystem().open(p);
        // Account for compressed files (e.g. gzip).
        // https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala
        CompressionCodecFactory factory = new CompressionCodecFactory(this.getFileSystem().getConf());
        CompressionCodec codec = factory.getCodec(p);
        return (codec == null) ? in : codec.createInputStream(in);
    } catch (IOException e) {
        throw new FileBasedHelperException("Cannot open file " + path + " due to " + e.getMessage(), e);
    }
}
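
A hypothetical calling sketch for getFileStream: the HadoopFsHelper instance and the path are assumptions (their setup is not shown in this excerpt), and try-with-resources performs the closing that the Javadoc above leaves to the caller.

    static long countLines(HadoopFsHelper helper, String path) throws FileBasedHelperException, IOException {
        try (InputStream in = helper.getFileStream(path);
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            // The stream is already decompressed by getFileStream when a codec matched the extension.
            return reader.lines().count();
        }
    }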