List of usage examples for org.apache.hadoop.io.BytesWritable.getLength()
@Override public int getLength()
From source file:org.freeeed.mr.MetadataWriter.java
License:Apache License
public void processMap(MapWritable value) throws IOException { columnMetadata.reinit();/*from www . j a v a2 s.co m*/ DocumentMetadata allMetadata = getAllMetadata(value); Metadata standardMetadata = getStandardMetadata(allMetadata); columnMetadata.addMetadata(standardMetadata); columnMetadata.addMetadata(allMetadata); // TODO deal with attachments if (allMetadata.hasParent()) { columnMetadata.addMetadataValue(DocumentMetadataKeys.ATTACHMENT_PARENT, ParameterProcessing.UPIFormat.format(masterOutputFileCount)); } //String uniqueId = allMetadata.getUniqueId(); String originalFileName = new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName(); // add the text to the text folder String documentText = allMetadata.get(DocumentMetadataKeys.DOCUMENT_TEXT); String textEntryName = ParameterProcessing.TEXT + "/" + allMetadata.getUniqueId() + "_" + originalFileName + ".txt"; if (textEntryName != null) { zipFileWriter.addTextFile(textEntryName, documentText); } columnMetadata.addMetadataValue(DocumentMetadata.TEXT_LINK(), textEntryName); // add the native file to the native folder String nativeEntryName = ParameterProcessing.NATIVE + "/" + allMetadata.getUniqueId() + "_" + originalFileName; BytesWritable bytesWritable = (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE)); if (bytesWritable != null) { // some large exception files are not passed zipFileWriter.addBinaryFile(nativeEntryName, bytesWritable.getBytes(), bytesWritable.getLength()); LOGGER.trace("Processing file: {}", nativeEntryName); } columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_NATIVE, nativeEntryName); // add the pdf made from native to the PDF folder String pdfNativeEntryName = ParameterProcessing.PDF_FOLDER + "/" + allMetadata.getUniqueId() + "_" + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName() + ".pdf"; BytesWritable pdfBytesWritable = (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE_AS_PDF)); if (pdfBytesWritable != 
null) { zipFileWriter.addBinaryFile(pdfNativeEntryName, pdfBytesWritable.getBytes(), pdfBytesWritable.getLength()); LOGGER.trace("Processing file: {}", pdfNativeEntryName); } processHtmlContent(value, allMetadata, allMetadata.getUniqueId()); // add exception to the exception folder String exception = allMetadata.get(DocumentMetadataKeys.PROCESSING_EXCEPTION); if (exception != null) { String exceptionEntryName = "exception/" + allMetadata.getUniqueId() + "_" + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName(); if (bytesWritable != null) { zipFileWriter.addBinaryFile(exceptionEntryName, bytesWritable.getBytes(), bytesWritable.getLength()); } columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_EXCEPTION, exceptionEntryName); } appendMetadata(columnMetadata.delimiterSeparatedValues()); // prepare for the next file with the same key, if there is any first = false; }
From source file:org.freeeed.mr.MetadataWriter.java
License:Apache License
private void processHtmlContent(MapWritable value, Metadata allMetadata, String uniqueId) throws IOException { BytesWritable htmlBytesWritable = (BytesWritable) value .get(new Text(ParameterProcessing.NATIVE_AS_HTML_NAME)); if (htmlBytesWritable != null) { String htmlNativeEntryName = ParameterProcessing.HTML_FOLDER + "/" + uniqueId + "_" + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName() + ".html"; zipFileWriter.addBinaryFile(htmlNativeEntryName, htmlBytesWritable.getBytes(), htmlBytesWritable.getLength()); LOGGER.trace("Processing file: {}", htmlNativeEntryName); // get the list with other files part of the html output Text htmlFiles = (Text) value.get(new Text(ParameterProcessing.NATIVE_AS_HTML)); if (htmlFiles != null) { String fileNames = htmlFiles.toString(); String[] fileNamesArr = fileNames.split(","); for (String fileName : fileNamesArr) { String entry = ParameterProcessing.HTML_FOLDER + "/" + fileName; BytesWritable imageBytesWritable = (BytesWritable) value .get(new Text(ParameterProcessing.NATIVE_AS_HTML + "/" + fileName)); if (imageBytesWritable != null) { zipFileWriter.addBinaryFile(entry, imageBytesWritable.getBytes(), imageBytesWritable.getLength()); LOGGER.trace("Processing file: {}", entry); }/* ww w. jav a2 s . c o m*/ } } } }
From source file:org.icgc.dcc.release.core.hadoop.SmileSequenceFileInputStream.java
License:Open Source License
/**
 * Returns a fresh array holding exactly the first {@code bw.getLength()}
 * bytes of the buffer returned by {@code bw.getBytes()}.
 *
 * @param bw the writable to copy from
 * @return a new array of length {@code bw.getLength()}
 */
private static byte[] getBytes(BytesWritable bw) {
    byte[] backingBuffer = bw.getBytes();
    int validLength = bw.getLength();
    // Copy only the leading validLength bytes of the backing buffer.
    return java.util.Arrays.copyOf(backingBuffer, validLength);
}
From source file:org.zuinnote.hadoop.bitcoin.format.BitcoinFormatHadoopTest.java
License:Apache License
/**
 * Reads {@code testdata/genesis.blk} through {@code BitcoinRawBlockFileInputFormat}
 * and checks that the single split yields exactly one block of 293 bytes.
 *
 * @throws IOException if the input format or the record reader fails
 */
@Test
public void readBitcoinRawBlockInputFormatGenesisBlock() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "genesis.blk";
    String fileNameGenesis = classLoader.getResource("testdata/" + fileName).getFile();
    Path file = new Path(fileNameGenesis);
    FileInputFormat.setInputPaths(job, file);
    BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
    format.configure(job);
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals("Only one split generated for genesis block", 1, inputSplits.length);
    RecordReader<BytesWritable, BytesWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull("Format returned null RecordReader", reader);
    try {
        BytesWritable genesisKey = new BytesWritable();
        BytesWritable genesisBlock = new BytesWritable();
        assertTrue("Input Split for genesis block contains at least one block",
                reader.next(genesisKey, genesisBlock));
        assertEquals("Genesis Block must have size of 293", 293, genesisBlock.getLength());
        BytesWritable emptyKey = new BytesWritable();
        BytesWritable emptyBlock = new BytesWritable();
        assertFalse("No further blocks in genesis Block", reader.next(emptyKey, emptyBlock));
    } finally {
        // Close the reader even when an assertion fails.
        reader.close();
    }
}
From source file:org.zuinnote.hadoop.bitcoin.format.BitcoinFormatHadoopTest.java
License:Apache License
@Test public void readBitcoinRawBlockInputFormatBlockVersion1() throws IOException { JobConf job = new JobConf(defaultConf); ClassLoader classLoader = getClass().getClassLoader(); String fileName = "version1.blk"; String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile(); Path file = new Path(fileNameBlock); FileInputFormat.setInputPaths(job, file); BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat(); format.configure(job);//from www .j a v a2s . c o m InputSplit[] inputSplits = format.getSplits(job, 1); assertEquals("Only one split generated for block version 1", 1, inputSplits.length); RecordReader<BytesWritable, BytesWritable> reader = format.getRecordReader(inputSplits[0], job, reporter); assertNotNull("Format returned null RecordReader", reader); BytesWritable key = new BytesWritable(); BytesWritable block = new BytesWritable(); assertTrue("Input Split for block version contains at least one block", reader.next(key, block)); assertEquals("Random block version 1 must have size of 482 bytes", 482, block.getLength()); BytesWritable emptyKey = new BytesWritable(); BytesWritable emptyBlock = new BytesWritable(); assertFalse("No further blocks in block version 1", reader.next(emptyKey, emptyBlock)); }
From source file:org.zuinnote.hadoop.bitcoin.format.BitcoinFormatHadoopTest.java
License:Apache License
/**
 * Reads {@code testdata/version2.blk} through {@code BitcoinRawBlockFileInputFormat}
 * and checks that the single split yields exactly one block of 191198 bytes.
 *
 * @throws IOException if the input format or the record reader fails
 */
@Test
public void readBitcoinRawBlockInputFormatBlockVersion2() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "version2.blk";
    String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
    Path file = new Path(fileNameBlock);
    FileInputFormat.setInputPaths(job, file);
    BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
    format.configure(job);
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals("Only one split generated for block version 2", 1, inputSplits.length);
    RecordReader<BytesWritable, BytesWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull("Format returned null RecordReader", reader);
    try {
        BytesWritable key = new BytesWritable();
        BytesWritable block = new BytesWritable();
        assertTrue("Input Split for block version contains at least one block", reader.next(key, block));
        assertEquals("Random block version 2 must have size of 191.198 bytes", 191198, block.getLength());
        BytesWritable emptyKey = new BytesWritable();
        BytesWritable emptyBlock = new BytesWritable();
        assertFalse("No further blocks in block version 2", reader.next(emptyKey, emptyBlock));
    } finally {
        // Close the reader even when an assertion fails.
        reader.close();
    }
}
From source file:org.zuinnote.hadoop.bitcoin.format.BitcoinFormatHadoopTest.java
License:Apache License
/**
 * Reads {@code testdata/version3.blk} through {@code BitcoinRawBlockFileInputFormat}
 * and checks that the single split yields exactly one block of 932199 bytes.
 *
 * @throws IOException if the input format or the record reader fails
 */
@Test
public void readBitcoinRawBlockInputFormatBlockVersion3() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "version3.blk";
    String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
    Path file = new Path(fileNameBlock);
    FileInputFormat.setInputPaths(job, file);
    BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
    format.configure(job);
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals("Only one split generated for block version 3", 1, inputSplits.length);
    RecordReader<BytesWritable, BytesWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull("Format returned null RecordReader", reader);
    try {
        BytesWritable key = new BytesWritable();
        BytesWritable block = new BytesWritable();
        assertTrue("Input Split for block version contains at least one block", reader.next(key, block));
        assertEquals("Random block version 3 must have size of 932.199 bytes", 932199, block.getLength());
        BytesWritable emptyKey = new BytesWritable();
        BytesWritable emptyBlock = new BytesWritable();
        assertFalse("No further blocks in block version 3", reader.next(emptyKey, emptyBlock));
    } finally {
        // Close the reader even when an assertion fails.
        reader.close();
    }
}
From source file:org.zuinnote.hadoop.bitcoin.format.BitcoinFormatHadoopTest.java
License:Apache License
/**
 * Reads {@code testdata/version4.blk} through {@code BitcoinRawBlockFileInputFormat}
 * and checks that the single split yields exactly one block of 998039 bytes.
 *
 * @throws IOException if the input format or the record reader fails
 */
@Test
public void readBitcoinRawBlockInputFormatBlockVersion4() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "version4.blk";
    String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
    Path file = new Path(fileNameBlock);
    FileInputFormat.setInputPaths(job, file);
    BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
    format.configure(job);
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals("Only one split generated for block version 4", 1, inputSplits.length);
    RecordReader<BytesWritable, BytesWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull("Format returned null RecordReader", reader);
    try {
        BytesWritable key = new BytesWritable();
        BytesWritable block = new BytesWritable();
        assertTrue("Input Split for block version contains at least one block", reader.next(key, block));
        assertEquals("Random block version 4 must have a size of 998.039 bytes", 998039, block.getLength());
        BytesWritable emptyKey = new BytesWritable();
        BytesWritable emptyBlock = new BytesWritable();
        assertFalse("No further blocks in block version 4", reader.next(emptyKey, emptyBlock));
    } finally {
        // Close the reader even when an assertion fails.
        reader.close();
    }
}
From source file:org.zuinnote.hadoop.bitcoin.format.BitcoinFormatHadoopTest.java
License:Apache License
@Test public void readBitcoinRawBlockInputFormatReqSeekBlockVersion1() throws IOException { JobConf job = new JobConf(defaultConf); ClassLoader classLoader = getClass().getClassLoader(); String fileName = "reqseekversion1.blk"; String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile(); Path file = new Path(fileNameBlock); FileInputFormat.setInputPaths(job, file); BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat(); format.configure(job);//from ww w . ja v a 2 s.c om InputSplit[] inputSplits = format.getSplits(job, 1); assertEquals("Only one split generated for block requiring seek version 1", 1, inputSplits.length); RecordReader<BytesWritable, BytesWritable> reader = format.getRecordReader(inputSplits[0], job, reporter); assertNotNull("Format returned null RecordReader", reader); BytesWritable key = new BytesWritable(); BytesWritable block = new BytesWritable(); assertTrue("Input Split for block version contains at least one block", reader.next(key, block)); assertEquals("Random block requiring seek version 1 must have a size of 482 bytes", 482, block.getLength()); BytesWritable emptyKey = new BytesWritable(); BytesWritable emptyBlock = new BytesWritable(); assertFalse("No further blocks in block requiring seek version 1", reader.next(emptyKey, emptyBlock)); }
From source file:org.zuinnote.hadoop.bitcoin.format.BitcoinFormatHadoopTest.java
License:Apache License
/**
 * Reads {@code testdata/multiblock.blk} through
 * {@code BitcoinRawBlockFileInputFormat} and checks that the single split
 * yields three blocks of 293, 482 and 191198 bytes, in that order, and
 * nothing after them.
 *
 * @throws IOException if the input format or the record reader fails
 */
@Test
public void readBitcoinRawBlockInputFormatMultiBlock() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "multiblock.blk";
    String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
    Path file = new Path(fileNameBlock);
    FileInputFormat.setInputPaths(job, file);
    BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
    format.configure(job);
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals("Only one split generated for multiblock", 1, inputSplits.length);
    RecordReader<BytesWritable, BytesWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull("Format returned null RecordReader", reader);
    try {
        // The same key/value pair is reused across next() calls, so getLength()
        // (not the backing array size) tells how large the current block is.
        BytesWritable key = new BytesWritable();
        BytesWritable block = new BytesWritable();
        assertTrue("Input Split for multi block contains the genesis block", reader.next(key, block));
        assertEquals("Genesis Block must have size of 293", 293, block.getLength());
        assertTrue("Input Split for block version contains block version 1", reader.next(key, block));
        assertEquals("Random block version 1 must have size of 482 bytes", 482, block.getLength());
        assertTrue("Input Split for block version contains block version 2", reader.next(key, block));
        assertEquals("Random block version 2 must have size of 191.198 bytes", 191198, block.getLength());
        BytesWritable emptyKey = new BytesWritable();
        BytesWritable emptyBlock = new BytesWritable();
        assertFalse("No further blocks in multi block", reader.next(emptyKey, emptyBlock));
    } finally {
        // Close the reader even when an assertion fails.
        reader.close();
    }
}