Example usage for org.apache.hadoop.io BytesWritable getLength

Introduction

On this page you can find example usages of the getLength() method of org.apache.hadoop.io.BytesWritable.

Prototype

@Override
public int getLength()

Source Link

Document

Get the current size of the buffer.

Usage

From source file:org.freeeed.mr.MetadataWriter.java

License:Apache License

public void processMap(MapWritable value) throws IOException {
    columnMetadata.reinit();/*from  www  . j a v  a2 s.co m*/

    DocumentMetadata allMetadata = getAllMetadata(value);

    Metadata standardMetadata = getStandardMetadata(allMetadata);
    columnMetadata.addMetadata(standardMetadata);
    columnMetadata.addMetadata(allMetadata);

    // TODO deal with attachments
    if (allMetadata.hasParent()) {
        columnMetadata.addMetadataValue(DocumentMetadataKeys.ATTACHMENT_PARENT,
                ParameterProcessing.UPIFormat.format(masterOutputFileCount));
    }

    //String uniqueId = allMetadata.getUniqueId();
    String originalFileName = new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName();
    // add the text to the text folder
    String documentText = allMetadata.get(DocumentMetadataKeys.DOCUMENT_TEXT);
    String textEntryName = ParameterProcessing.TEXT + "/" + allMetadata.getUniqueId() + "_" + originalFileName
            + ".txt";
    if (textEntryName != null) {
        zipFileWriter.addTextFile(textEntryName, documentText);
    }
    columnMetadata.addMetadataValue(DocumentMetadata.TEXT_LINK(), textEntryName);
    // add the native file to the native folder
    String nativeEntryName = ParameterProcessing.NATIVE + "/" + allMetadata.getUniqueId() + "_"
            + originalFileName;
    BytesWritable bytesWritable = (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE));
    if (bytesWritable != null) { // some large exception files are not passed
        zipFileWriter.addBinaryFile(nativeEntryName, bytesWritable.getBytes(), bytesWritable.getLength());
        LOGGER.trace("Processing file: {}", nativeEntryName);
    }
    columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_NATIVE, nativeEntryName);
    // add the pdf made from native to the PDF folder
    String pdfNativeEntryName = ParameterProcessing.PDF_FOLDER + "/" + allMetadata.getUniqueId() + "_"
            + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName() + ".pdf";
    BytesWritable pdfBytesWritable = (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE_AS_PDF));
    if (pdfBytesWritable != null) {
        zipFileWriter.addBinaryFile(pdfNativeEntryName, pdfBytesWritable.getBytes(),
                pdfBytesWritable.getLength());
        LOGGER.trace("Processing file: {}", pdfNativeEntryName);
    }

    processHtmlContent(value, allMetadata, allMetadata.getUniqueId());

    // add exception to the exception folder
    String exception = allMetadata.get(DocumentMetadataKeys.PROCESSING_EXCEPTION);
    if (exception != null) {
        String exceptionEntryName = "exception/" + allMetadata.getUniqueId() + "_"
                + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName();
        if (bytesWritable != null) {
            zipFileWriter.addBinaryFile(exceptionEntryName, bytesWritable.getBytes(),
                    bytesWritable.getLength());
        }
        columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_EXCEPTION, exceptionEntryName);
    }
    appendMetadata(columnMetadata.delimiterSeparatedValues());
    // prepare for the next file with the same key, if there is any
    first = false;
}

From source file:org.freeeed.mr.MetadataWriter.java

License:Apache License

private void processHtmlContent(MapWritable value, Metadata allMetadata, String uniqueId) throws IOException {
    BytesWritable htmlBytesWritable = (BytesWritable) value
            .get(new Text(ParameterProcessing.NATIVE_AS_HTML_NAME));
    if (htmlBytesWritable != null) {
        String htmlNativeEntryName = ParameterProcessing.HTML_FOLDER + "/" + uniqueId + "_"
                + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName() + ".html";
        zipFileWriter.addBinaryFile(htmlNativeEntryName, htmlBytesWritable.getBytes(),
                htmlBytesWritable.getLength());
        LOGGER.trace("Processing file: {}", htmlNativeEntryName);

        // get the list with other files part of the html output
        Text htmlFiles = (Text) value.get(new Text(ParameterProcessing.NATIVE_AS_HTML));
        if (htmlFiles != null) {
            String fileNames = htmlFiles.toString();
            String[] fileNamesArr = fileNames.split(",");
            for (String fileName : fileNamesArr) {
                String entry = ParameterProcessing.HTML_FOLDER + "/" + fileName;

                BytesWritable imageBytesWritable = (BytesWritable) value
                        .get(new Text(ParameterProcessing.NATIVE_AS_HTML + "/" + fileName));
                if (imageBytesWritable != null) {
                    zipFileWriter.addBinaryFile(entry, imageBytesWritable.getBytes(),
                            imageBytesWritable.getLength());
                    LOGGER.trace("Processing file: {}", entry);
                }/* ww w.  jav a2 s .  c o m*/
            }
        }
    }
}

From source file:org.icgc.dcc.release.core.hadoop.SmileSequenceFileInputStream.java

License:Open Source License

private static byte[] getBytes(BytesWritable bw) {
    byte[] padded = bw.getBytes();
    byte[] bytes = new byte[bw.getLength()];
    System.arraycopy(padded, 0, bytes, 0, bytes.length);

    return bytes;
}

From source file:org.zuinnote.hadoop.bitcoin.format.BitcoinFormatHadoopTest.java

License:Apache License

@Test
public void readBitcoinRawBlockInputFormatGenesisBlock() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "genesis.blk";
    String fileNameGenesis = classLoader.getResource("testdata/" + fileName).getFile();
    Path file = new Path(fileNameGenesis);
    FileInputFormat.setInputPaths(job, file);
    BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
    format.configure(job);/* www.  j a  va  2s.  co m*/
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals("Only one split generated for genesis block", 1, inputSplits.length);
    RecordReader<BytesWritable, BytesWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull("Format returned  null RecordReader", reader);
    BytesWritable genesisKey = new BytesWritable();
    BytesWritable genesisBlock = new BytesWritable();
    assertTrue("Input Split for genesis block contains at least one block",
            reader.next(genesisKey, genesisBlock));
    assertEquals("Genesis Block must have size of 293", 293, genesisBlock.getLength());
    BytesWritable emptyKey = new BytesWritable();
    BytesWritable emptyBlock = new BytesWritable();
    assertFalse("No further blocks in genesis Block", reader.next(emptyKey, emptyBlock));
}

From source file:org.zuinnote.hadoop.bitcoin.format.BitcoinFormatHadoopTest.java

License:Apache License

@Test
public void readBitcoinRawBlockInputFormatBlockVersion1() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "version1.blk";
    String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
    Path file = new Path(fileNameBlock);
    FileInputFormat.setInputPaths(job, file);
    BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
    format.configure(job);//from   www .j a v  a2s  .  c o  m
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals("Only one split generated for block version 1", 1, inputSplits.length);
    RecordReader<BytesWritable, BytesWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull("Format returned  null RecordReader", reader);
    BytesWritable key = new BytesWritable();
    BytesWritable block = new BytesWritable();
    assertTrue("Input Split for block version contains at least one block", reader.next(key, block));
    assertEquals("Random block version 1  must have size of 482 bytes", 482, block.getLength());
    BytesWritable emptyKey = new BytesWritable();
    BytesWritable emptyBlock = new BytesWritable();
    assertFalse("No further blocks in block version 1", reader.next(emptyKey, emptyBlock));
}

From source file:org.zuinnote.hadoop.bitcoin.format.BitcoinFormatHadoopTest.java

License:Apache License

@Test
public void readBitcoinRawBlockInputFormatBlockVersion2() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "version2.blk";
    String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
    Path file = new Path(fileNameBlock);
    FileInputFormat.setInputPaths(job, file);
    BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
    format.configure(job);/*  w  w w  .  j a v  a 2  s . c o m*/
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals("Only one split generated for block version 2", 1, inputSplits.length);
    RecordReader<BytesWritable, BytesWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull("Format returned  null RecordReader", reader);
    BytesWritable key = new BytesWritable();
    BytesWritable block = new BytesWritable();
    assertTrue("Input Split for block version contains at least one block", reader.next(key, block));
    assertEquals("Random block version 2  must have size of 191.198 bytes", 191198, block.getLength());
    BytesWritable emptyKey = new BytesWritable();
    BytesWritable emptyBlock = new BytesWritable();
    assertFalse("No further blocks in block version 2", reader.next(emptyKey, emptyBlock));
}

From source file:org.zuinnote.hadoop.bitcoin.format.BitcoinFormatHadoopTest.java

License:Apache License

@Test
public void readBitcoinRawBlockInputFormatBlockVersion3() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "version3.blk";
    String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
    Path file = new Path(fileNameBlock);
    FileInputFormat.setInputPaths(job, file);
    BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
    format.configure(job);/*from ww  w . j av a2s  .c o  m*/
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals("Only one split generated for block version 3", 1, inputSplits.length);
    RecordReader<BytesWritable, BytesWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull("Format returned  null RecordReader", reader);
    BytesWritable key = new BytesWritable();
    BytesWritable block = new BytesWritable();
    assertTrue("Input Split for block version contains at least one block", reader.next(key, block));
    assertEquals("Random block version 3 must have size of 932.199 bytes", 932199, block.getLength());
    BytesWritable emptyKey = new BytesWritable();
    BytesWritable emptyBlock = new BytesWritable();
    assertFalse("No further blocks in block version 3", reader.next(emptyKey, emptyBlock));
}

From source file:org.zuinnote.hadoop.bitcoin.format.BitcoinFormatHadoopTest.java

License:Apache License

@Test
public void readBitcoinRawBlockInputFormatBlockVersion4() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "version4.blk";
    String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
    Path file = new Path(fileNameBlock);
    FileInputFormat.setInputPaths(job, file);
    BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
    format.configure(job);/* ww w .  j  a v a2s  .co m*/
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals("Only one split generated for block version 4", 1, inputSplits.length);
    RecordReader<BytesWritable, BytesWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull("Format returned  null RecordReader", reader);
    BytesWritable key = new BytesWritable();
    BytesWritable block = new BytesWritable();
    assertTrue("Input Split for block version contains at least one block", reader.next(key, block));
    assertEquals("Random block version 4 must have a size of 998.039 bytes", 998039, block.getLength());
    BytesWritable emptyKey = new BytesWritable();
    BytesWritable emptyBlock = new BytesWritable();
    assertFalse("No further blocks in block version 4", reader.next(emptyKey, emptyBlock));
}

From source file:org.zuinnote.hadoop.bitcoin.format.BitcoinFormatHadoopTest.java

License:Apache License

@Test
public void readBitcoinRawBlockInputFormatReqSeekBlockVersion1() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "reqseekversion1.blk";
    String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
    Path file = new Path(fileNameBlock);
    FileInputFormat.setInputPaths(job, file);
    BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
    format.configure(job);//from  ww  w  .  ja v a 2 s.c  om
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals("Only one split generated for block requiring seek version 1", 1, inputSplits.length);
    RecordReader<BytesWritable, BytesWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull("Format returned  null RecordReader", reader);
    BytesWritable key = new BytesWritable();
    BytesWritable block = new BytesWritable();
    assertTrue("Input Split for block version contains at least one block", reader.next(key, block));
    assertEquals("Random block requiring seek version 1 must have a size of 482 bytes", 482, block.getLength());
    BytesWritable emptyKey = new BytesWritable();
    BytesWritable emptyBlock = new BytesWritable();
    assertFalse("No further blocks in block requiring seek version 1", reader.next(emptyKey, emptyBlock));
}

From source file:org.zuinnote.hadoop.bitcoin.format.BitcoinFormatHadoopTest.java

License:Apache License

@Test
public void readBitcoinRawBlockInputFormatMultiBlock() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "multiblock.blk";
    String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
    Path file = new Path(fileNameBlock);
    FileInputFormat.setInputPaths(job, file);
    BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
    format.configure(job);/*from   ww w. ja va 2s  . com*/
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals("Only one split generated for multiblock", 1, inputSplits.length);
    RecordReader<BytesWritable, BytesWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull("Format returned  null RecordReader", reader);
    BytesWritable key = new BytesWritable();
    BytesWritable block = new BytesWritable();
    assertTrue("Input Split for multi block contains the genesis block", reader.next(key, block));
    assertEquals("Genesis Block must have size of 293", 293, block.getLength());
    assertTrue("Input Split for block version contains block version 1", reader.next(key, block));
    assertEquals("Random block version 1  must have size of 482 bytes", 482, block.getLength());
    assertTrue("Input Split for block version contains block version 2", reader.next(key, block));
    assertEquals("Random block version 2  must have size of 191.198 bytes", 191198, block.getLength());
    BytesWritable emptyKey = new BytesWritable();
    BytesWritable emptyBlock = new BytesWritable();
    assertFalse("No further blocks in multi block", reader.next(emptyKey, emptyBlock));
}