List of usage examples for org.apache.hadoop.io.compress.CompressionCodec.createInputStream
CompressionInputStream createInputStream(InputStream in) throws IOException;
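For orientation, here is a minimal sketch of the pattern the examples below share: ask a CompressionCodecFactory for the codec matching a file's extension, then wrap the raw stream with createInputStream when a codec is found. The path /tmp/data.txt.gz and the standalone main are illustrative assumptions, not taken from any of the projects listed.

import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecReadSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/data.txt.gz"); // hypothetical input file
        FileSystem fs = path.getFileSystem(conf);
        // Infer the codec from the file extension (e.g. ".gz" -> GzipCodec).
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        try (InputStream raw = fs.open(path);
                InputStream in = (codec == null) ? raw : codec.createInputStream(raw)) {
            // 'in' yields decompressed bytes, or the raw stream if no codec matched.
            byte[] buf = new byte[4096];
            int n;
            while ((n = in.read(buf)) != -1) {
                System.out.write(buf, 0, n);
            }
        }
    }
}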
From source file:com.yolodata.tbana.hadoop.mapred.csv.CSVLineRecordReader.java
License:Open Source License
public void initialize(InputSplit genericSplit, JobConf conf) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (codec != null) {
        is = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            fileIn.seek(start);
        }
        is = fileIn;
    }
    this.pos = start;
    init(is, conf);
}
From source file:cosmos.mapred.LongLineRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LfLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LfLineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:crunch.MaxTemperature.java
License:Apache License
public static void main(String[] args) throws Exception {
    String uri = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);
    Path inputPath = new Path(uri);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(inputPath);
    if (codec == null) {
        System.err.println("No codec found for " + uri);
        System.exit(1);
    }
    String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
    InputStream in = null;
    OutputStream out = null;
    try {
        in = codec.createInputStream(fs.open(inputPath));
        out = fs.create(new Path(outputUri));
        IOUtils.copyBytes(in, out, conf);
    } finally {
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }
}
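A hypothetical invocation, assuming the class is packaged into app.jar: hadoop jar app.jar crunch.MaxTemperature input.txt.gz. The codec is inferred from the .gz suffix, and CompressionCodecFactory.removeSuffix strips that suffix so the decompressed copy is written to input.txt alongside the original.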
From source file:de.rwhq.hdfs.index.LineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.io.WARCFileReader.java
License:Apache License
/**
 * Opens a file for reading. If the filename ends in `.gz`, it is automatically
 * decompressed on the fly.
 *
 * @param conf     The Hadoop configuration.
 * @param filePath The Hadoop path to the file that should be read.
 * @throws IOException I/O exception
 */
public WARCFileReader(Configuration conf, Path filePath) throws IOException {
    FileSystem fs = filePath.getFileSystem(conf);
    this.fileSize = fs.getFileStatus(filePath).getLen();
    logger.info("Reading from " + filePath);

    CompressionCodec codec = filePath.getName().endsWith(".gz") ? WARCFileWriter.getGzipCodec(conf) : null;
    byteStream = new CountingInputStream(new BufferedInputStream(fs.open(filePath)));
    dataStream = new DataInputStream(codec == null ? byteStream : codec.createInputStream(byteStream));
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.standalone.WarcBoilerplateRemoval.java
License:Apache License
public static void processWarcGzFile(File input, File outFile, boolean keepMinimalHtml) throws IOException {
    System.out.printf("Reading from %s, writing to %s%n", input, outFile);

    Configuration conf = new Configuration();
    // set limit to 100 GB (= almost unlimited)
    conf.setLong("warc.output.segment.size", WARCFileWriter.DEFAULT_MAX_SEGMENT_SIZE * 100);

    // Opens a file for reading.
    CompressionCodec codec = WARCFileWriter.getGzipCodec(conf);
    InputStream byteStream = new BufferedInputStream(new FileInputStream(input));
    DataInputStream dataStream = new DataInputStream(
            codec == null ? byteStream : codec.createInputStream(byteStream));

    BoilerPlateRemoval boilerPlateRemoval = new JusTextBoilerplateRemoval();

    long startTime = System.currentTimeMillis();
    int counter = 0;
    int recordsRead = 0;

    Path outputPath = new Path(outFile.getAbsolutePath());
    WARCFileWriter warcFileWriter = new WARCFileWriter(conf, codec, outputPath);

    // detecting the correct charset
    final CharsetDetector charsetDetector = new ICUCharsetDetectorWrapper();

    while (true) {
        try {
            // Reads the next record from the file.
            WARCRecord wc = new WARCRecord(dataStream);

            // detect charset
            byte[] bytes = wc.getContent();
            Charset charset = charsetDetector.detectCharset(bytes);

            String html = new String(bytes, charset);

            // strip HTTP header
            html = html.substring(html.indexOf("\r\n\r\n") + 4);

            String plainText;
            if (keepMinimalHtml) {
                plainText = boilerPlateRemoval.getMinimalHtml(html, null);
            } else {
                plainText = boilerPlateRemoval.getPlainText(html, null);
            }

            counter++;
            if (counter % 100 == 0) {
                System.out.printf(Locale.ENGLISH, "~%.1f entries per second%n",
                        counter * 1000f / (double) (System.currentTimeMillis() - startTime));
                System.out.printf(Locale.ENGLISH, "%d records processed%n", recordsRead);
            }
            recordsRead++;

            // create copy of WarcRecord
            WARCRecord newWarcRecord = new WARCRecord(wc);
            newWarcRecord.setContent(plainText);

            warcFileWriter.write(newWarcRecord);
        } catch (EOFException e) {
            break;
        }
    }

    warcFileWriter.close();

    // rename from out.warc.gz.seg-00000.warc.gz to out.warc.gz
    File actualOutputFile = new File(outFile.getAbsolutePath() + ".seg-00000.warc.gz");
    if (!actualOutputFile.exists()) {
        throw new IOException("File " + actualOutputFile + " does not exist");
    }
    if (!actualOutputFile.renameTo(outFile)) {
        throw new IOException("Renaming file " + actualOutputFile + " to " + outFile + " failed");
    }

    // delete .crc file
    File crcFile = new File(actualOutputFile.getParentFile(), "." + actualOutputFile.getName() + ".crc");
    if (!crcFile.delete()) {
        throw new IOException(crcFile + " was not deleted");
    }

    System.out.printf(Locale.ENGLISH, "%d records written to %s, total time %f%n", recordsRead,
            outFile.getName(), counter * 1000f / (double) (System.currentTimeMillis() - startTime));
}
From source file:edu.cmu.cs.in.hadoop.HoopWholeFileRecordReader.java
License:Open Source License
public HoopWholeFileRecordReader(JobConf aJob, InputSplit aSplit) {
    setClassName("HoopWholeFileRecordReader");
    debug("HoopWholeFileRecordReader ()");

    job = aJob;

    FileSplit split = (FileSplit) aSplit;

    //this.maxLineLength=job.getInt ("mapred.linerecordreader.maxlength",Integer.MAX_VALUE);

    fileSize = split.getLength();

    final Path file = split.getPath();
    createKeyFromName(file.getName());

    debug("File/Key: " + internalKey + " with size: " + split.getLength());

    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    FileSystem fs = null;
    try {
        fs = file.getFileSystem(job);
    } catch (IOException e) {
        e.printStackTrace();
    }

    FSDataInputStream fileIn = null;
    try {
        fileIn = fs.open(split.getPath());
    } catch (IOException e) {
        e.printStackTrace();
    }

    if (codec != null) {
        try {
            inStream = codec.createInputStream(fileIn);
        } catch (IOException e) {
            e.printStackTrace();
        }
    } else {
        inStream = fileIn;
    }
}
From source file:edu.uci.ics.hyracks.imru.file.HDFSUtils.java
License:Apache License
/**
 * Open a file in HDFS for reading, performing automatic
 * decompression as necessary.
 *
 * @param dfs
 *            The HDFS file system object.
 * @param conf
 *            The HDFS configuration.
 * @param path
 *            The path to the file.
 * @return An InputStream for reading the file.
 * @throws IOException
 */
public static InputStream open(FileSystem dfs, Configuration conf, Path path) throws IOException {
    FSDataInputStream fin = dfs.open(path);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(path);
    if (codec != null) {
        return codec.createInputStream(fin);
    } else {
        return fin;
    }
}
From source file:format.OverlapRecordReader.java
License:BSD License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    //Configuration job = HadoopUtils.getConfiguration(context);
    Configuration job = context.getConfiguration();
    maxLineLen = job.getInt(MAX_LINE_LEN_CONF, Integer.MAX_VALUE);

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("Codec for file " + file + " not found, cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    in = new LineReader(codec.createInputStream(fileIn), job);

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    this.pos = start;
}
From source file:gobblin.source.extractor.hadoop.HadoopFsHelper.java
License:Apache License
/**
 * Returns an {@link InputStream} to the specified file.
 * <p>
 * Note: It is the caller's responsibility to close the returned {@link InputStream}.
 * </p>
 *
 * @param path The path to the file to open.
 * @return An {@link InputStream} for the specified file.
 * @throws FileBasedHelperException if there is a problem opening the {@link InputStream} for the specified file.
 */
@Override
public InputStream getFileStream(String path) throws FileBasedHelperException {
    try {
        Path p = new Path(path);
        InputStream in = this.getFileSystem().open(p);
        // Account for compressed files (e.g. gzip).
        // https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala
        CompressionCodecFactory factory = new CompressionCodecFactory(this.getFileSystem().getConf());
        CompressionCodec codec = factory.getCodec(p);
        return (codec == null) ? in : codec.createInputStream(in);
    } catch (IOException e) {
        throw new FileBasedHelperException("Cannot open file " + path + " due to " + e.getMessage(), e);
    }
}