List of usage examples for org.apache.hadoop.io.compress CompressionCodec createInputStream
CompressionInputStream createInputStream(InputStream in) throws IOException;
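The method wraps a raw InputStream in a CompressionInputStream that decompresses the bytes read from it. The usual pattern, repeated throughout the examples below, is to let a CompressionCodecFactory pick the codec from the file extension and fall back to the raw stream when no codec matches. A minimal sketch of that pattern (the class and method names are illustrative, not from any of the projects below):

import java.io.IOException;
import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecOpenSketch {
    // Open a possibly compressed HDFS file, decompressing transparently.
    public static InputStream open(Path file, Configuration conf) throws IOException {
        FileSystem fs = file.getFileSystem(conf);
        // Resolve a codec from the file name (e.g. ".gz" -> GzipCodec); null means uncompressed.
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
        if (codec != null) {
            return codec.createInputStream(fs.open(file)); // decompressing stream
        }
        return fs.open(file); // plain stream
    }
}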
From source file:gov.jgi.meta.hadoop.input.FastaBlockRecordReader.java
License:Open Source License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.info("initializing FastaBlockRecordReader");
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (codec != null) {
        // compressed input cannot be split: read the whole stream from the start
        in = new FastaBlockLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            // unlike the stock LineRecordReader, do not back up one byte
            // or skip a partial first line here ("don't do this!" per the original author)
            fileIn.seek(start);
        }
        in = new FastaBlockLineReader(fileIn, job);
    }
    this.pos = start;
}
From source file:gov.jgi.meta.hadoop.input.FastaRecordReader.java
License:Open Source License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new FastaLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = false; // don't do this! //--start; or this
            fileIn.seek(start);
        }
        in = new FastaLineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), new Text(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:gov.jgi.meta.hadoop.input.FastqBlockRecordReader.java
License:Open Source License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (codec != null) {
        // compressed input cannot be split: read the whole stream from the start
        in = new FastqBlockLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            // unlike the stock LineRecordReader, do not back up one byte
            // or skip a partial first line here ("don't do this!" per the original author)
            fileIn.seek(start);
        }
        in = new FastqBlockLineReader(fileIn, job);
    }
    this.pos = start;
}
From source file:gov.jgi.meta.hadoop.input.FastqRecordReader.java
License:Open Source License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new FastqLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = false; // don't do this! //--start; or this
            fileIn.seek(start);
        }
        in = new FastqLineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), new Text(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:gr.ntua.h2rdf.inputFormat.MyLineRecordReader.java
License:Open Source License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    MyFileSplit split = (MyFileSplit) (MyInputSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
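This is the stock LineRecordReader approach: for an uncompressed split that does not start at byte 0, back up one byte and discard the first (partial) line, since the previous split's reader already consumed it. The FASTA/FASTQ readers above deliberately skip that step, apparently because their records are not plain newline-delimited lines.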
From source file:gr.ntua.h2rdf.loadTriples.TranslateAndImport.java
License:Apache License
public Job createSubmittableJob(String[] args) throws IOException, ClassNotFoundException {
    // compute sample partitions
    FileSystem fs;
    Configuration conf = new Configuration();
    int collected = 0, chunks = 0;
    try {
        fs = FileSystem.get(conf);
        Path sampleDir = new Path("sample");
        FileStatus[] samples = fs.listStatus(sampleDir);

        // first pass: collect the distinct node labels from the gzip'd sample files
        TreeSet<String> set = new TreeSet<String>();
        for (FileStatus sample : samples) {
            FSDataInputStream in = fs.open(sample.getPath());
            CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(GzipCodec.class, conf);
            CompressionInputStream in1 = codec.createInputStream(in);
            NxParser nxp = new NxParser(in1);
            Iterator<Node[]> it = nxp.iterator();
            while (it.hasNext()) {
                Node[] tr = it.next();
                //System.out.println(tr[0].toN3());
                set.add(tr[0].toN3());
                set.add(tr[1].toN3());
                set.add(tr[2].toN3());
            }
            in1.close();
            in.close();
        }
        IndexTranslator translator = new IndexTranslator(TABLE_NAME + "_Index");
        HashMap<String, Long> index = translator.translate(set);
        set.clear();

        // second pass: re-read the samples and emit all six triple permutations
        TreeSet<ImmutableBytesWritable> set1 = new TreeSet<ImmutableBytesWritable>(
                new ImmutableBytesWritable.Comparator());
        for (FileStatus sample : samples) {
            FSDataInputStream in = fs.open(sample.getPath());
            CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(GzipCodec.class, conf);
            CompressionInputStream in1 = codec.createInputStream(in);
            NxParser nxp = new NxParser(in1);
            Iterator<Node[]> it = nxp.iterator();
            while (it.hasNext()) {
                Node[] tr = it.next();
                ByteTriple btr = new ByteTriple(index.get(tr[0].toN3()), index.get(tr[1].toN3()),
                        index.get(tr[2].toN3()));
                set1.add(new ImmutableBytesWritable(btr.getSPOByte()));
                set1.add(new ImmutableBytesWritable(btr.getSOPByte()));
                set1.add(new ImmutableBytesWritable(btr.getOPSByte()));
                set1.add(new ImmutableBytesWritable(btr.getOSPByte()));
                set1.add(new ImmutableBytesWritable(btr.getPOSByte()));
                set1.add(new ImmutableBytesWritable(btr.getPSOByte()));
            }
            in1.close();
            in.close();
        }
        index.clear();

        // write the partition boundaries for the TotalOrderPartitioner
        Path p = new Path("hexastorePartition");
        if (fs.exists(p)) {
            fs.delete(p, true);
        }
        SequenceFile.Writer partitionWriter = SequenceFile.createWriter(fs, conf, p,
                ImmutableBytesWritable.class, NullWritable.class);
        double chunkSize = bucketSampledTriples * DistinctIds.samplingRate;
        System.out.println("chunkSize: " + chunkSize);
        Iterator<ImmutableBytesWritable> it = set1.iterator();
        while (it.hasNext()) {
            ImmutableBytesWritable key = it.next();
            if (collected > chunkSize) {
                partitionWriter.append(key, NullWritable.get());
                //System.out.println(Bytes.toStringBinary(key.get()));
                collected = 0;
                chunks++;
            } else {
                collected++;
            }
        }
        System.out.println("chunks: " + chunks);
        partitionWriter.close();
    } catch (IOException e) {
        e.printStackTrace();
    }

    Job job = new Job(conf, "Import Hexastore");
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    Path out = new Path("out");
    try {
        fs = FileSystem.get(conf);
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    FileOutputFormat.setOutputPath(job, out);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("hexastorePartition"));
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(HFileOutputFormat.class);

    // enable Snappy compression for the I, S and T column families of the HFile output
    StringBuilder compressionConfigValue = new StringBuilder();
    compressionConfigValue.append(URLEncoder.encode("I", "UTF-8"));
    compressionConfigValue.append('=');
    compressionConfigValue.append(URLEncoder.encode(Algorithm.SNAPPY.getName(), "UTF-8"));
    compressionConfigValue.append('&');
    compressionConfigValue.append(URLEncoder.encode("S", "UTF-8"));
    compressionConfigValue.append('=');
    compressionConfigValue.append(URLEncoder.encode(Algorithm.SNAPPY.getName(), "UTF-8"));
    compressionConfigValue.append('&');
    compressionConfigValue.append(URLEncoder.encode("T", "UTF-8"));
    compressionConfigValue.append('=');
    compressionConfigValue.append(URLEncoder.encode(Algorithm.SNAPPY.getName(), "UTF-8"));
    job.getConfiguration().set("hbase.hfileoutputformat.families.compression",
            compressionConfigValue.toString());
    //job.getConfiguration().setInt("hbase.mapreduce.hfileoutputformat.blocksize", 262144);
    //job.getConfiguration().setInt("hbase.mapreduce.hfileoutputformat.blocksize", 16384);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setJarByClass(TranslateAndImport.class);
    job.setMapperClass(Map.class);
    //job.setReducerClass(HexaStoreHistogramsReduce.class);
    job.setReducerClass(HexaStoreReduce.class);
    job.getConfiguration().set("h2rdf.tableName", TABLE_NAME);
    job.getConfiguration().setInt("mapred.reduce.tasks", chunks + 1);
    //job.setCombinerClass(Combiner.class);
    job.setJobName("Translate Projections");
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().setInt("io.sort.mb", 100);
    job.getConfiguration().setInt("io.file.buffer.size", 131072);
    job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", -1);
    job.getConfiguration().set("mapred.compress.map.output", "true");
    job.getConfiguration().set("mapred.map.output.compression.codec",
            "org.apache.hadoop.io.compress.SnappyCodec");
    //job.getConfiguration().setInt("hbase.hregion.max.filesize", 268435456);
    //job.getConfiguration().setInt("hbase.hregion.max.filesize", 67108864);
    job.getConfiguration().setInt("hbase.hregion.max.filesize", 33554432);
    return job;
}
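Unlike the factory-based examples, this code instantiates GzipCodec directly through ReflectionUtils, since the sample files are known to be gzip-compressed; the resulting CompressionInputStream is then handed to NxParser to stream-parse the N-Triples.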
From source file:hdfsIO.fileInteractions.java
public List<String> readLines(Path location, Configuration conf) throws Exception {
    FileSystem fileSystem = FileSystem.get(location.toUri(), conf);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    FileStatus[] items = fileSystem.listStatus(location);
    if (items == null) {
        return new ArrayList<String>();
    }
    List<String> results = new ArrayList<String>();
    for (FileStatus item : items) {
        // ignoring files like _SUCCESS
        if (item.getPath().getName().startsWith("_")) {
            continue;
        }
        CompressionCodec codec = factory.getCodec(item.getPath());
        InputStream stream = null;
        // check if we have a compression codec we need to use
        if (codec != null) {
            stream = codec.createInputStream(fileSystem.open(item.getPath()));
        } else {
            stream = fileSystem.open(item.getPath());
        }
        StringWriter writer = new StringWriter();
        IOUtils.copy(stream, writer, "UTF-8");
        String raw = writer.toString();
        for (String str : raw.split("\n")) {
            results.add(str);
        }
    }
    return results;
}
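A short usage sketch for the method above (the directory is a placeholder, and it assumes fileInteractions has a no-arg constructor):

fileInteractions fi = new fileInteractions();
List<String> lines = fi.readLines(new Path("/user/hadoop/job-output"), new Configuration());
for (String line : lines) {
    System.out.println(line);
}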
From source file:InvertedIndex.NLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.job = job;
    this.context = context;
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    this.path = file;
    this.length = split.getLength();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        if (0 == split.getLength() && job.getBoolean("mapred.ignore.badcompress", false)) {
            if (null != context && context instanceof TaskInputOutputContext) {
                ((TaskInputOutputContext) context).getCounter("Input Counter", "Gzip File length is zero")
                        .increment(1);
            }
            if (null != this.path) {
                LOG.warn("Skip 0-length Zip file: " + this.path.toString());
            }
            in = new NLineReader(fileIn, job);
        } else {
            try {
                in = new NLineReader(codec.createInputStream(fileIn), job);
                end = Long.MAX_VALUE;
            } catch (IOException e) {
                if (isIgnoreBadCompress(job, e)) {
                    in = new NLineReader(fileIn, job);
                    end = start;
                    LOG.warn("Skip Bad Compress File: " + this.path.toString());
                    LOG.warn("initialize line read error", e);
                    ((TaskInputOutputContext) context).getCounter("Input Counter", "Skip Bad Zip File")
                            .increment(1);
                    ((TaskInputOutputContext) context).getCounter("Input Counter", "Total Skip Bad Zip Length")
                            .increment(this.length);
                } else {
                    throw e;
                }
            }
        }
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new NLineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
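This variant hardens the stock pattern against bad archives: a zero-length gzip file, or one whose decompression fails on open, is skipped (with diagnostic counters incremented) when mapred.ignore.badcompress is set, instead of failing the task.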
From source file:io.aos.hdfs.FileDecompressor.java
License:Apache License
public static void main(String... args) throws Exception {
    String uri = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);

    Path inputPath = new Path(uri);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(inputPath);
    if (codec == null) {
        System.err.println("No codec found for " + uri);
        System.exit(1);
    }

    String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());

    InputStream in = null;
    OutputStream out = null;
    try {
        in = codec.createInputStream(fs.open(inputPath));
        out = fs.create(new Path(outputUri));
        IOUtils.copyBytes(in, out, conf);
    } finally {
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }
}
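CompressionCodecFactory.removeSuffix strips the codec's default extension, so an input like file.gz is decompressed to a sibling named file.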
From source file:io.druid.indexer.Utils.java
License:Apache License
public static InputStream openInputStream(JobContext job, Path inputPath, final FileSystem fileSystem)
        throws IOException {
    if (!FileOutputFormat.getCompressOutput(job)) {
        return fileSystem.open(inputPath);
    } else {
        Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(job,
                GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job.getConfiguration());
        inputPath = new Path(inputPath.toString() + codec.getDefaultExtension());
        return codec.createInputStream(fileSystem.open(inputPath));
    }
}
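Note the contrast with the factory-based examples above: because this helper reads back files that the same job wrote, it consults the job's compress-output setting rather than probing the file name, and reconstructs the writer's path by appending the codec's default extension (e.g. ".gz" for GzipCodec).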