List of usage examples for org.apache.hadoop.mapreduce.TaskAttemptContext.getConfiguration()
public Configuration getConfiguration();
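getConfiguration() returns the job's Configuration and is the usual way for a Mapper, Reducer, RecordReader, or OutputFormat to read job-level settings at task time. Before the examples from real projects below, here is a minimal sketch of the common pattern; the class name ThresholdMapper and the property my.example.threshold are made up for illustration:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ThresholdMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    private int threshold;

    @Override
    protected void setup(Context context) {
        // Mapper.Context extends TaskAttemptContext, so getConfiguration() is available here.
        Configuration conf = context.getConfiguration();
        // "my.example.threshold" is a hypothetical property name, set on the driver side
        // via job.getConfiguration().setInt("my.example.threshold", ...).
        threshold = conf.getInt("my.example.threshold", 10);
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Emit only lines at least `threshold` bytes long.
        if (value.getLength() >= threshold) {
            context.write(value, new LongWritable(1));
        }
    }
}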
From source file:gov.jgi.meta.hadoop.input.FastqBlockRecordReader.java
License:Open Source License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new FastqBlockLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = false; // don't do this!
            //--start;             or this
            fileIn.seek(start);
        }
        in = new FastqBlockLineReader(fileIn, job);
    }
    this.pos = start;
}
From source file:gov.jgi.meta.hadoop.input.FastqRecordReader.java
License:Open Source License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new FastqLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = false; // don't do this!
            //--start;             or this
            fileIn.seek(start);
        }
        in = new FastqLineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), new Text(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
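Note the contrast with the stock LineRecordReader pattern (shown in MyLineRecordReader below), which sets skipFirstLine to true, backs start up by one byte, and discards the first, partial line of every non-initial split. These two FASTQ readers deliberately leave skipFirstLine false — the "don't do this!" comments are in the original source — presumably because a FASTQ record spans multiple lines and the custom FastqLineReader/FastqBlockLineReader classes resolve record boundaries themselves.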
From source file:gov.jgi.meta.hadoop.output.FastaOutputFormat.java
License:Open Source License
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    boolean isCompressed = getCompressOutput(job);
    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        extension = codec.getDefaultExtension();
    }
    Path file = getDefaultWorkFile(job, extension);
    FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        return (new FastaRecordWriter<K, V>(fileOut));
    } else {
        FSDataOutputStream fileOut = fs.create(file, false);
        return (new FastaRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut))));
    }
}
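Here getCompressOutput and getOutputCompressorClass read their answers from the same Configuration that getRecordWriter obtains through the context. A minimal driver-side sketch of how those settings usually get there (the class name and output path are hypothetical):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CompressedOutputDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        // These helpers only set properties on job.getConfiguration(); the
        // OutputFormat later reads them back via TaskAttemptContext.getConfiguration().
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        FileOutputFormat.setOutputPath(job, new Path("/tmp/fasta-out")); // hypothetical path
        // job.setOutputFormatClass(FastaOutputFormat.class); // to use the format above
    }
}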
From source file:gov.llnl.ontology.text.hbase.XMLRecordReader.java
License:Open Source License
/**
 * Extract the {@link Path} for the file to be processed by this
 * {@link XMLRecordReader}.
 */
public void initialize(InputSplit isplit, TaskAttemptContext context) throws IOException, InterruptedException {
    Configuration config = context.getConfiguration();

    // Get the file stream for the xml file.
    FileSplit split = (FileSplit) isplit;
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(config);
    fsin = (useGzip) ? new GZIPInputStream(fs.open(split.getPath())) : fs.open(split.getPath());
    fsin = new BufferedInputStream(fsin);

    // Setup the limits of the xml file.
    start = split.getStart();
    end = start + split.getLength();
    pos = 0;

    // Get the xml document delimiters for this xml file.
    if (!config.get(DELIMITER_TAG).equals("")) {
        startTag = ("<" + config.get(DELIMITER_TAG)).getBytes();
        endTag = ("</" + config.get(DELIMITER_TAG) + ">").getBytes();
    } else {
        String fileNameBase = file.getName().replace(".xml", "");
        startTag = ("<" + fileNameBase).getBytes();
        endTag = ("</" + fileNameBase).getBytes();
    }
    context.setStatus(file.getName() + " " + pos + " " + end);
}
From source file:gr.ntua.h2rdf.inputFormat.HFileRecordReaderBufferedScan.java
License:Open Source License
/**
 * Initializes the reader.
 *
 * @param inputsplit The split to work with.
 * @param context The current task context.
 * @throws IOException When setting up the reader fails.
 * @throws InterruptedException When the job is aborted.
 * @see org.apache.hadoop.mapreduce.RecordReader#initialize(
 *      org.apache.hadoop.mapreduce.InputSplit,
 *      org.apache.hadoop.mapreduce.TaskAttemptContext)
 */
@Override
public void initialize(InputSplit inputsplit, TaskAttemptContext context) throws IOException, InterruptedException {
    tsplit = (TableColumnSplit) inputsplit;
    scan = new Scan();
    /*byte[] rowid = tsplit.getStartRow();
    byte[] startr = new byte[19];
    byte[] stopr = new byte[19];
    for (int i = 0; i < rowid.length; i++) {
        startr[i] = rowid[i];
        stopr[i] = rowid[i];
    }
    if (rowid.length == 18) {
        startr[18] = (byte) 0;
        stopr[18] = (byte) MyNewTotalOrderPartitioner.MAX_HBASE_BUCKETS;
    }
    if (rowid.length == 10) {
        for (int i = 10; i < startr.length - 1; i++) {
            startr[i] = (byte) 0;
            stopr[i] = (byte) 255;
        }
        startr[startr.length - 1] = (byte) 0;
        stopr[startr.length - 1] = (byte) MyNewTotalOrderPartitioner.MAX_HBASE_BUCKETS;
    }*/
    scan.setStartRow(tsplit.getStartRow());
    scan.setStopRow(tsplit.getStopRow());
    scan.setCaching(30000);     // good for mapreduce scan
    scan.setCacheBlocks(false); // good for mapreduce scan
    scan.setBatch(30000);       // good for mapreduce scan

    byte[] a, bid = null;
    a = Bytes.toBytes("A");
    bid = new byte[a.length];
    for (int i = 0; i < a.length; i++) {
        bid[i] = a[i];
    }
    //System.out.println(Bytes.toStringBinary(bid));
    scan.addFamily(bid);
    HTable table = new HTable(HBconf, tsplit.getTable());
    resultScanner = table.getScanner(scan);
    //System.out.println(Bytes.toStringBinary(scan.getStartRow()));
    //System.out.println(Bytes.toStringBinary(scan.getStopRow()));
    /* System.out.println(Bytes.toString(Bytes.toBytes(scan.getInputColumns())));
    Get get = new Get(scan.getStartRow());
    Result re;
    System.out.println("iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii");
    while ((re = resultScanner.next()) != null) {
        System.out.println("o");
        System.out.println(re.size());
        //System.out.println(String.format("%s$$%s ", var1, Bytes.toString(list.next().getQualifier())));
    }
    System.exit(1);*/

    result = resultScanner.next();
    more = false;
    if (result == null) {
        empty = true;
    } else {
        more = true;
        list = result.list().iterator();
        kv = list.next();
    }

    Configuration conf = context.getConfiguration();
    String newjoinVars = conf.get("input.patId");
    String joinVars = newjoinVars.split(tsplit.getFname())[1];
    joinVars = joinVars.substring(0, joinVars.indexOf("$$") - 1);
    String vars = tsplit.getVars();
    StringTokenizer vtok = new StringTokenizer(vars);
    varsno = 0;
    while (vtok.hasMoreTokens()) {
        vtok.nextToken();
        varsno++;
    }
    if (varsno == 1) {
        StringTokenizer vtok2 = new StringTokenizer(vars);
        v1 = vtok2.nextToken();
    } else if (varsno == 2) {
        StringTokenizer vtok2 = new StringTokenizer(vars);
        v1 = vtok2.nextToken();
        v2 = vtok2.nextToken();
    }
}
From source file:gr.ntua.h2rdf.inputFormat.MultiHFileOutputFormat.java
License:Open Source License
public RecordWriter<ImmutableBytesWritable, KeyValue> getRecordWriter(final TaskAttemptContext context)
        throws IOException, InterruptedException {
    return new RecordWriter<ImmutableBytesWritable, KeyValue>() {
        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            for (RecordWriter<ImmutableBytesWritable, KeyValue> writer : writers.values()) {
                writer.close(context);
            }
        }

        @Override
        public void write(ImmutableBytesWritable key, KeyValue value) throws IOException, InterruptedException {
            RecordWriter<ImmutableBytesWritable, KeyValue> writer = writers.get(key);
            if (writer == null) {
                final Path outputPath = new Path(
                        FileOutputFormat.getOutputPath(context).toString() + "/" + Bytes.toString(key.get()));
                writer = new RecordWriter<ImmutableBytesWritable, KeyValue>() {
                    final FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
                    final Path outputdir = committer.getWorkPath();
                    final Configuration conf = context.getConfiguration();
                    final FileSystem fs = outputdir.getFileSystem(conf);
                    final long maxsize = conf.getLong("hbase.hregion.max.filesize",
                            HConstants.DEFAULT_MAX_FILE_SIZE);
                    final int blocksize = conf.getInt("hfile.min.blocksize.size", HFile.DEFAULT_BLOCKSIZE);
                    // Invented config. Add to hbase-*.xml if other than default compression.
                    final String compression = conf.get("hfile.compression",
                            Compression.Algorithm.NONE.getName());
                    // Map of families to writers and how much has been output on the writer.
                    final Map<byte[], WriterLength> writers = new TreeMap<byte[], WriterLength>(
                            Bytes.BYTES_COMPARATOR);
                    byte[] previousRow = HConstants.EMPTY_BYTE_ARRAY;
                    final byte[] now = Bytes.toBytes(System.currentTimeMillis());
                    boolean rollRequested = false;

                    public void write(ImmutableBytesWritable row, KeyValue kv) throws IOException {
                        // null input == user explicitly wants to flush
                        if (row == null && kv == null) {
                            rollWriters();
                            return;
                        }
                        byte[] rowKey = kv.getRow();
                        long length = kv.getLength();
                        byte[] family = kv.getFamily();
                        WriterLength wl = this.writers.get(family);
                        // If this is a new column family, verify that the directory exists
                        if (wl == null) {
                            fs.mkdirs(new Path(outputdir, Bytes.toString(family)));
                        }
                        // If any of the HFiles for the column families has reached
                        // maxsize, we need to roll all the writers
                        if (wl != null && wl.written + length >= maxsize) {
                            this.rollRequested = true;
                        }
                        // This can only happen once a row is finished though
                        if (rollRequested && Bytes.compareTo(this.previousRow, rowKey) != 0) {
                            rollWriters();
                        }
                        // create a new HLog writer, if necessary
                        if (wl == null || wl.writer == null) {
                            wl = getNewWriter(family);
                        }
                        // we now have the proper HLog writer. full steam ahead
                        kv.updateLatestStamp(this.now);
                        wl.writer.append(kv);
                        wl.written += length;
                        // Copy the row so we know when a row transition happens.
                        this.previousRow = rowKey;
                    }

                    private void rollWriters() throws IOException {
                        for (WriterLength wl : this.writers.values()) {
                            if (wl.writer != null) {
                                close(wl.writer);
                            }
                            wl.writer = null;
                            wl.written = 0;
                        }
                        this.rollRequested = false;
                    }

                    private HFile.Writer getNewWriter(final HFile.Writer writer, final Path familydir,
                            Configuration conf) throws IOException {
                        if (writer != null) {
                            close(writer);
                        }
                        return HFile.getWriterFactoryNoCache(conf).create();
                        //return HFile.getWriterFactory(conf).createWriter(fs, StoreFile.getUniqueFile(fs, familydir),
                        //        blocksize, compression, KeyValue.KEY_COMPARATOR);
                        //return new HFile.Writer(fs, StoreFile.getUniqueFile(fs, familydir),
                        //        blocksize, compression, KeyValue.KEY_COMPARATOR);
                    }

                    private WriterLength getNewWriter(byte[] family) throws IOException {
                        WriterLength wl = new WriterLength();
                        Path familydir = new Path(outputdir, Bytes.toString(family));
                        wl.writer = getNewWriter(wl.writer, familydir, conf);
                        this.writers.put(family, wl);
                        return wl;
                    }

                    private void close(final HFile.Writer w) throws IOException {
                        if (w != null) {
                            w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY,
                                    Bytes.toBytes(System.currentTimeMillis()));
                            w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
                                    Bytes.toBytes(context.getTaskAttemptID().toString()));
                            w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY, Bytes.toBytes(true));
                            w.close();
                        }
                    }

                    public void close(TaskAttemptContext c) throws IOException, InterruptedException {
                        for (WriterLength wl : this.writers.values()) {
                            close(wl.writer);
                        }
                        committer.commitTask(c);
                    }
                };
                writers.put(key, writer);
            }
            writer.write(new ImmutableBytesWritable(value.getRow()), value);
        }
    };
}
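Usage note: because the outer writers map is keyed on the ImmutableBytesWritable passed to write(), this OutputFormat fans records out into one HFile directory per distinct key under the job's output path, with a per-column-family writer nested inside each — effectively a per-key variant of HBase's HFileOutputFormat for bulk loading.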
From source file:gr.ntua.h2rdf.inputFormat.MyLineRecordReader.java
License:Open Source License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    MyFileSplit split = (MyFileSplit) (MyInputSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:gr.ntua.h2rdf.inputFormat2.MultiTableInputFormatBase.java
License:Open Source License
/**
 * Builds a TableRecordReader. If no TableRecordReader was provided, uses the
 * default.
 *
 * @param split The split to work with.
 * @param context The current context.
 * @return The newly created record reader.
 * @throws IOException When creating the reader fails.
 * @throws InterruptedException when record reader initialization fails
 * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(
 *      org.apache.hadoop.mapreduce.InputSplit,
 *      org.apache.hadoop.mapreduce.TaskAttemptContext)
 */
@Override
public RecordReader<Bindings, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    TableSplit tSplit = (TableSplit) split;
    if (tSplit.getTableName() == null) {
        throw new IOException("Cannot create a record reader because of a"
                + " previous error. Please look at the previous logs lines from"
                + " the task's full log for more details.");
    }
    HTable table = new HTable(context.getConfiguration(), tSplit.getTableName());
    TableRecordReader trr = this.tableRecordReader;
    // if no table record reader was provided use default
    if (trr == null) {
        trr = new TableRecordReader();
    }
    Scan sc = tSplit.getScan();
    sc.setStartRow(tSplit.getStartRow());
    sc.setStopRow(tSplit.getEndRow());
    trr.setScan(sc);
    trr.setHTable(table);
    trr.initialize(split, context);
    return trr;
}
From source file:hadoop.inputsplit.FastaLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    done = false;
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);
    currentValue = new ValueWritable();
    value = new Text();
    tmpValue = new Text();
    tmp = new Text();

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    String homeHdfs = context.getConfiguration().get("HDFS_HOME_DIR");
    //maxK = HadoopUtil.getMaxkFromPatterns(fs, new Path(homeHdfs + Constant.HDFS_PATTERNS_FILE_HDFS));

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job, recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job, recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job, recordDelimiterBytes);
        filePosition = fileIn;
    }

    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;

    setKeySeq(fs, job); // set currentKey
    nextMyKeyValue();   // read the first record, if one exists
}
From source file:hadoop.TweetRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job);
        filePosition = fileIn;
    }
    this.pos = start;
}