List of usage examples for org.apache.hadoop.io.compress.CompressionCodec.createInputStream
CompressionInputStream createInputStream(InputStream in) throws IOException;
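For orientation, here is a minimal sketch of the pattern the examples below share: ask a CompressionCodecFactory for the codec matching a file's extension, then wrap the raw stream with createInputStream when a codec is found. The path /tmp/data.txt.gz and the standalone main are illustrative assumptions, not taken from any of the projects listed.

import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecReadSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/data.txt.gz"); // hypothetical input file
        FileSystem fs = path.getFileSystem(conf);
        // Infer the codec from the file extension (e.g. ".gz" -> GzipCodec).
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
        try (InputStream raw = fs.open(path);
                InputStream in = (codec == null) ? raw : codec.createInputStream(raw)) {
            // 'in' yields decompressed bytes, or the raw stream if no codec matched.
            byte[] buf = new byte[4096];
            int n;
            while ((n = in.read(buf)) != -1) {
                System.out.write(buf, 0, n);
            }
        }
    }
}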
From source file:com.yolodata.tbana.hadoop.mapred.csv.CSVLineRecordReader.java
License:Open Source License
public void initialize(InputSplit genericSplit, JobConf conf) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (codec != null) {
        is = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            fileIn.seek(start);
        }
        is = fileIn;
    }
    this.pos = start;
    init(is, conf);
}
From source file:cosmos.mapred.LongLineRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LfLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LfLineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:crunch.MaxTemperature.java
License:Apache License
public static void main(String[] args) throws Exception {
    String uri = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);
    Path inputPath = new Path(uri);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(inputPath);
    if (codec == null) {
        System.err.println("No codec found for " + uri);
        System.exit(1);
    }
    String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
    InputStream in = null;
    OutputStream out = null;
    try {
        in = codec.createInputStream(fs.open(inputPath));
        out = fs.create(new Path(outputUri));
        IOUtils.copyBytes(in, out, conf);
    } finally {
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }
}
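A hypothetical invocation, assuming the class is packaged into app.jar: hadoop jar app.jar crunch.MaxTemperature input.txt.gz. The codec is inferred from the .gz suffix, and CompressionCodecFactory.removeSuffix strips that suffix so the decompressed copy is written to input.txt alongside the original.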
From source file:de.rwhq.hdfs.index.LineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.io.WARCFileReader.java
License:Apache License
/**
 * Opens a file for reading. If the filename ends in `.gz`, it is automatically
 * decompressed on the fly.
 *
 * @param conf     The Hadoop configuration.
 * @param filePath The Hadoop path to the file that should be read.
 * @throws IOException I/O exception
 */
public WARCFileReader(Configuration conf, Path filePath) throws IOException {
    FileSystem fs = filePath.getFileSystem(conf);
    this.fileSize = fs.getFileStatus(filePath).getLen();
    logger.info("Reading from " + filePath);

    CompressionCodec codec = filePath.getName().endsWith(".gz") ? WARCFileWriter.getGzipCodec(conf) : null;
    byteStream = new CountingInputStream(new BufferedInputStream(fs.open(filePath)));
    dataStream = new DataInputStream(codec == null ? byteStream : codec.createInputStream(byteStream));
}
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.standalone.WarcBoilerplateRemoval.java
License:Apache License
public static void processWarcGzFile(File input, File outFile, boolean keepMinimalHtml) throws IOException {
    System.out.printf("Reading from %s, writing to %s%n", input, outFile);

    Configuration conf = new Configuration();
    // set limit to 100 GB (= almost unlimited)
    conf.setLong("warc.output.segment.size", WARCFileWriter.DEFAULT_MAX_SEGMENT_SIZE * 100);

    // Opens a file for reading.
    CompressionCodec codec = WARCFileWriter.getGzipCodec(conf);
    InputStream byteStream = new BufferedInputStream(new FileInputStream(input));
    DataInputStream dataStream = new DataInputStream(
            codec == null ? byteStream : codec.createInputStream(byteStream));

    BoilerPlateRemoval boilerPlateRemoval = new JusTextBoilerplateRemoval();

    long startTime = System.currentTimeMillis();
    int counter = 0;
    int recordsRead = 0;

    Path outputPath = new Path(outFile.getAbsolutePath());
    WARCFileWriter warcFileWriter = new WARCFileWriter(conf, codec, outputPath);

    // detecting the correct charset
    final CharsetDetector charsetDetector = new ICUCharsetDetectorWrapper();

    while (true) {
        try {
            // Reads the next record from the file.
            WARCRecord wc = new WARCRecord(dataStream);

            // detect charset
            byte[] bytes = wc.getContent();
            Charset charset = charsetDetector.detectCharset(bytes);

            String html = new String(bytes, charset);

            // strip HTTP header
            html = html.substring(html.indexOf("\r\n\r\n") + 4);

            String plainText;
            if (keepMinimalHtml) {
                plainText = boilerPlateRemoval.getMinimalHtml(html, null);
            } else {
                plainText = boilerPlateRemoval.getPlainText(html, null);
            }

            counter++;
            if (counter % 100 == 0) {
                System.out.printf(Locale.ENGLISH, "~%.1f entries per second%n",
                        counter * 1000f / (double) (System.currentTimeMillis() - startTime));
                System.out.printf(Locale.ENGLISH, "%d records processed%n", recordsRead);
            }
            recordsRead++;

            // create copy of WarcRecord
            WARCRecord newWarcRecord = new WARCRecord(wc);
            newWarcRecord.setContent(plainText);

            warcFileWriter.write(newWarcRecord);
        } catch (EOFException e) {
            break;
        }
    }

    warcFileWriter.close();

    // rename from out.warc.gz.seg-00000.warc.gz to out.warc.gz
    File actualOutputFile = new File(outFile.getAbsolutePath() + ".seg-00000.warc.gz");
    if (!actualOutputFile.exists()) {
        throw new IOException("File " + actualOutputFile + " does not exist");
    }
    if (!actualOutputFile.renameTo(outFile)) {
        throw new IOException("Renaming file " + actualOutputFile + " to " + outFile + " failed");
    }

    // delete .crc file
    File crcFile = new File(actualOutputFile.getParentFile(), "." + actualOutputFile.getName() + ".crc");
    if (!crcFile.delete()) {
        throw new IOException(crcFile + " was not deleted");
    }

    System.out.printf(Locale.ENGLISH, "%d records written to %s, total time %f%n", recordsRead,
            outFile.getName(), counter * 1000f / (double) (System.currentTimeMillis() - startTime));
}
From source file:edu.cmu.cs.in.hadoop.HoopWholeFileRecordReader.java
License:Open Source License
public HoopWholeFileRecordReader(JobConf aJob, InputSplit aSplit) {
    setClassName("HoopWholeFileRecordReader");
    debug("HoopWholeFileRecordReader ()");

    job = aJob;

    FileSplit split = (FileSplit) aSplit;

    //this.maxLineLength=job.getInt ("mapred.linerecordreader.maxlength",Integer.MAX_VALUE);

    fileSize = split.getLength();

    final Path file = split.getPath();
    createKeyFromName(file.getName());

    debug("File/Key: " + internalKey + " with size: " + split.getLength());

    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    FileSystem fs = null;
    try {
        fs = file.getFileSystem(job);
    } catch (IOException e) {
        e.printStackTrace();
    }

    FSDataInputStream fileIn = null;
    try {
        fileIn = fs.open(split.getPath());
    } catch (IOException e) {
        e.printStackTrace();
    }

    if (codec != null) {
        try {
            inStream = codec.createInputStream(fileIn);
        } catch (IOException e) {
            e.printStackTrace();
        }
    } else {
        inStream = fileIn;
    }
}
From source file:edu.uci.ics.hyracks.imru.file.HDFSUtils.java
License:Apache License
/**
 * Open a file in HDFS for reading, performing automatic
 * decompression as necessary.
 *
 * @param dfs
 *            The HDFS file system object.
 * @param conf
 *            The HDFS configuration.
 * @param path
 *            The path to the file.
 * @return An InputStream for reading the file.
 * @throws IOException
 */
public static InputStream open(FileSystem dfs, Configuration conf, Path path) throws IOException {
    FSDataInputStream fin = dfs.open(path);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(path);
    if (codec != null) {
        return codec.createInputStream(fin);
    } else {
        return fin;
    }
}
From source file:format.OverlapRecordReader.java
License:BSD License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    //Configuration job = HadoopUtils.getConfiguration(context);
    Configuration job = context.getConfiguration();
    maxLineLen = job.getInt(MAX_LINE_LEN_CONF, Integer.MAX_VALUE);

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("Codec for file " + file + " not found, cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    in = new LineReader(codec.createInputStream(fileIn), job);

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    this.pos = start;
}
From source file:gobblin.source.extractor.hadoop.HadoopFsHelper.java
License:Apache License
/**
 * Returns an {@link InputStream} to the specified file.
 * <p>
 * Note: It is the caller's responsibility to close the returned {@link InputStream}.
 * </p>
 *
 * @param path The path to the file to open.
 * @return An {@link InputStream} for the specified file.
 * @throws FileBasedHelperException if there is a problem opening the {@link InputStream} for the specified file.
 */
@Override
public InputStream getFileStream(String path) throws FileBasedHelperException {
    try {
        Path p = new Path(path);
        InputStream in = this.getFileSystem().open(p);
        // Account for compressed files (e.g. gzip).
        // https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala
        CompressionCodecFactory factory = new CompressionCodecFactory(this.getFileSystem().getConf());
        CompressionCodec codec = factory.getCodec(p);
        return (codec == null) ? in : codec.createInputStream(in);
    } catch (IOException e) {
        throw new FileBasedHelperException("Cannot open file " + path + " due to " + e.getMessage(), e);
    }
}