List of usage examples for org.apache.hadoop.io BytesWritable getLength
@Override public int getLength()
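Returns the current size of the valid data in the buffer. Note that every example below pairs getBytes() with getLength(): getBytes() returns the backing buffer, which may be larger than the valid data, so reads must be bounded by getLength(). A minimal sketch of that pattern (the class name and sample values are illustrative, not taken from the sources below):

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.hadoop.io.BytesWritable;

public class BytesWritableGetLengthExample {
    public static void main(String[] args) {
        byte[] payload = "hello".getBytes(StandardCharsets.UTF_8);
        BytesWritable writable = new BytesWritable(payload);

        // getLength() reports the number of valid bytes (5 here)
        System.out.println("length = " + writable.getLength());

        // getBytes() returns the backing buffer, which can be larger than
        // getLength() once the writable has been reused or resized, so
        // always bound reads by getLength()
        byte[] valid = Arrays.copyOfRange(writable.getBytes(), 0, writable.getLength());
        System.out.println(new String(valid, StandardCharsets.UTF_8));
    }
}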
From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileRecordReaderTests.java
License:Apache License
@Test
public void TestARCFileRecordReader() throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    FileSystem fs = LocalFileSystem.get(conf);
    Path path = new Path("/tmp/" + File.createTempFile("ARCRecordReader", "test"));

    List<TestRecord> records = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    FSDataOutputStream os = fs.create(path);
    try {
        // write the ARC File into memory
        ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());
        long testAttemptTime = System.currentTimeMillis();
        for (TestRecord record : records) {
            ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                    new NIOHttpHeaders(), "text/html", MD5Hash.digest(record.data).toString(), 12345,
                    testAttemptTime);
        }
        os.flush();
    } finally {
        os.close();
    }

    FileSplit split = new FileSplit(path, 0, fs.getFileStatus(path).getLen(), new String[0]);
    ARCFileRecordReader reader = new ARCFileRecordReader();
    reader.initialize(split, new TaskAttemptContext(conf, new TaskAttemptID()));

    int index = 0;
    // iterate and validate stuff ...
    while (reader.nextKeyValue()) {
        Text key = reader.getCurrentKey();
        BytesWritable value = reader.getCurrentValue();

        TestRecord testRecord = records.get(index++);

        // get test key bytes as utf-8 bytes ...
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's UTF-8 mapping code replaces
        // invalid characters with '?', which would cause our test case, which does use invalid characters to
        // form the key, to break)
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // returned bytes represent the header (encoded in UTF-8), terminated by a \r\n\r\n. The content follows
        // this terminator; we search for this specific byte pattern to locate the start of content, then
        // compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(index, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    fs.delete(path, false);
}
From source file:org.commoncrawl.util.CompressedURLFPListV2.java
License:Open Source License
public static void main(String[] args) {
    // initialize ...
    final Configuration conf = new Configuration();

    conf.addResource("nutch-default.xml");
    conf.addResource("nutch-site.xml");
    conf.addResource("core-site.xml");
    conf.addResource("hdfs-site.xml");
    conf.addResource("mapred-site.xml");

    BasicConfigurator.configure();
    CrawlEnvironment.setHadoopConfig(conf);

    try {
        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
        Path testFile = new Path("crawl/linkdb/merged1282844121161/linkData/part-00000");
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, testFile, conf);

        URLFPV2 fp = new URLFPV2();
        BytesWritable bytes = new BytesWritable();

        while (reader.next(fp, bytes)) {
            if (bytes.getLength() != 0) {
                DataInputBuffer inputStream = new DataInputBuffer();
                inputStream.reset(bytes.get(), bytes.getLength());
                CompressedURLFPListV2.Reader listReader = new CompressedURLFPListV2.Reader(inputStream);
                while (listReader.hasNext()) {
                    URLFPV2 nextFP = listReader.next();
                    LOG.info("DH:" + nextFP.getDomainHash() + " UH:" + nextFP.getUrlHash());
                }
            } else {
                LOG.error("ZERO BYTE LIST!");
            }
        }
        reader.close();
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
    }

    // early exit: skip the validation routines below
    if (1 == 1)
        return;

    validateDuplicateChecking();
    // validateReallyBigList();
    validateURLFPSerializationRootDomain();
    validateURLFPSerializationSingleSubDomain();
    validateURLFPSerializationMultiDomain();
    validateURLFPFlagSerializationRootDomain();
    validateURLFPFlagSerializationMultipleSubDomains();
    validateURLFPFlagSerializationOneSubDomain();
}
From source file:org.commoncrawl.util.shared.ARCFileReader.java
License:Apache License
public static void main(String[] args) throws IOException, URISyntaxException, InterruptedException {
    Configuration conf = new Configuration();
    String path = null;

    CommandLineParser parser = new GnuParser();
    try {
        // parse the command line arguments
        CommandLine cmdLine = parser.parse(options, args);
        // get ARCFile Path
        path = cmdLine.getOptionValue("file");
        // get optional config
        if (cmdLine.hasOption("conf")) {
            conf.addResource(new Path(cmdLine.getOptionValue("conf")));
        }
        if (cmdLine.hasOption("awsAccessKey")) {
            conf.set("fs.s3n.awsAccessKeyId", cmdLine.getOptionValue("awsAccessKey"));
        }
        if (cmdLine.hasOption("awsSecret")) {
            conf.set("fs.s3n.awsSecretAccessKey", cmdLine.getOptionValue("awsSecret"));
        }
    } catch (ParseException e) {
        System.out.println(e.toString());
        printUsage();
        System.exit(1);
    }

    final URI uri = new URI(path);
    FileSystem fs = FileSystem.get(uri, conf);

    // byte data[] = new byte[4096 * 10];
    // int readAmt = 0;
    // while ((readAmt = stream.get().read(data)) != -1) {
    //     System.out.println(HexDump.dumpHexString(data, 0, readAmt));
    // }
    // stream.get().close();
    // System.exit(1);

    ARCFileReader reader = null;
    try {
        System.out.println("Initializing Reader for Path:" + uri);
        reader = new ARCFileReader(fs.open(new Path(path)));

        Text key = new Text();
        BytesWritable value = new BytesWritable();

        while (reader.hasMoreItems()) {
            reader.nextKeyValue(key, value);
            int indexOfTrailingCRLF = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                    "\r\n\r\n".getBytes());
            int headerLen = indexOfTrailingCRLF + 4;
            int contentLen = value.getLength() - headerLen;

            String outputStr = "Key:" + key.toString() + " HeaderLen:" + headerLen + " ContentLen:" + contentLen;
            System.out.println(outputStr);

            // String contentStr = new String(value.getBytes(), headerLen, contentLen, Charset.forName("ASCII"));
            // System.out.println(contentStr.substring(contentStr.length() - 20));
        }
        System.out.println("Exiting Loop");
    } catch (Exception e) {
        System.out.println(CCStringUtils.stringifyException(e));
        LOG.error(CCStringUtils.stringifyException(e));
        // throw new IOException(e);
    } finally {
        if (reader != null) {
            System.out.println("***Closing Reader");
            reader.close();
        }
    }
}
From source file:org.commoncrawl.util.shared.ArcFileReaderTests.java
License:Apache License
/**
 * test basic reader functionality by creating a mock ARCFile in memory and then reading it back and
 * validating the contents...
 */
@Test
public void testReader() {
    DataOutputBuffer os = new DataOutputBuffer();
    long timestamp = System.currentTimeMillis();
    try {
        // write the ARC File into memory
        writeFirstRecord(os, "test", timestamp);

        List<TestRecord> records = buildTestRecords(BASIC_TEST_RECORD_COUNT);

        long testAttemptTime = System.currentTimeMillis();
        for (TestRecord record : records) {
            NIOHttpHeaders headers = new NIOHttpHeaders();
            for (int i = 0; i < record.headers.size(); ++i) {
                headers.set(record.headers.get(i).e0, record.headers.get(i).e1);
            }
            write(os, record.url, "test", 1, 1, record.data, 0, record.data.length, headers, "text/html",
                    MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
        }
        os.flush();
        os.close();

        final AtomicBoolean streamClosed = new AtomicBoolean();
        // setup ArcFileReader to read the file
        InputStream in = new ByteArrayInputStream(os.getData(), 0, os.getLength()) {
            // force reads of at most one byte at a time
            public synchronized int read(byte b[], int off, int len) {
                len = 1;
                return super.read(b, off, len);
            }

            public void close() throws IOException {
                super.close();
                streamClosed.set(true);
            }
        };
        ARCFileReader reader = new ARCFileReader(in);
        int index = 0;

        Text key = new Text();
        BytesWritable value = new BytesWritable();

        // iterate and validate stuff ...
        while (reader.hasMoreItems()) {
            reader.nextKeyValue(key, value);

            TestRecord testRecord = records.get(index++);

            // get test key bytes as utf-8 bytes ...
            byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
            // compare against raw key bytes to validate the key is the same (Text's UTF-8 mapping code replaces
            // invalid characters with '?', which would cause our test case, which does use invalid characters
            // to form the key, to break)
            Assert.assertTrue(
                    compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0);
            // returned bytes represent the header (encoded in UTF-8), terminated by a \r\n\r\n. The content
            // follows this terminator; we search for this specific byte pattern to locate the start of content,
            // then compare it against the source ...
            int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                    "\r\n\r\n".getBytes());
            if (indexofHeaderTerminator == -1) {
                throw new IOException("No Header Terminator found in Value!");
            }
            indexofHeaderTerminator += 4;
            // read headers ...
            String headersText = new String(value.getBytes(), 0, indexofHeaderTerminator,
                    Charset.forName("UTF-8"));
            NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(headersText);
            for (int i = 0; i < testRecord.headers.size(); ++i) {
                Pair<String, String> testHeaderRecord = testRecord.headers.get(i);
                Assert.assertNotNull(headers.findValue(testHeaderRecord.e0));
                Assert.assertEquals(testHeaderRecord.e1, headers.findValue(testHeaderRecord.e0));
            }

            Assert.assertTrue(compareTo(testRecord.data, 0, testRecord.data.length, value.getBytes(),
                    indexofHeaderTerminator, testRecord.data.length) == 0);
        }
        reader.close();

        Assert.assertEquals(index, BASIC_TEST_RECORD_COUNT);
        Assert.assertTrue(streamClosed.get());
    } catch (IOException e) {
        e.printStackTrace();
        throw new RuntimeException(e);
    }
}
From source file:org.elasticsearch.hadoop.mr.MapReduceWriter.java
License:Apache License
@SuppressWarnings("unchecked") public boolean write(Writable writable, Generator generator) { if (writable == null || writable instanceof NullWritable) { generator.writeNull();//from w w w . j a v a 2 s.c o m } else if (writable instanceof Text) { Text text = (Text) writable; generator.writeUTF8String(text.getBytes(), 0, text.getLength()); } else if (writable instanceof UTF8) { UTF8 utf8 = (UTF8) writable; generator.writeUTF8String(utf8.getBytes(), 0, utf8.getLength()); } else if (writable instanceof IntWritable) { generator.writeNumber(((IntWritable) writable).get()); } else if (writable instanceof LongWritable) { generator.writeNumber(((LongWritable) writable).get()); } else if (writable instanceof VLongWritable) { generator.writeNumber(((VLongWritable) writable).get()); } else if (writable instanceof VIntWritable) { generator.writeNumber(((VIntWritable) writable).get()); } else if (writable instanceof ByteWritable) { generator.writeNumber(((ByteWritable) writable).get()); } else if (writable instanceof DoubleWritable) { generator.writeNumber(((DoubleWritable) writable).get()); } else if (writable instanceof FloatWritable) { generator.writeNumber(((FloatWritable) writable).get()); } else if (writable instanceof BooleanWritable) { generator.writeBoolean(((BooleanWritable) writable).get()); } else if (writable instanceof BytesWritable) { BytesWritable bw = (BytesWritable) writable; generator.writeBinary(bw.getBytes(), 0, bw.getLength()); } else if (writable instanceof MD5Hash) { generator.writeString(writable.toString()); } else if (writable instanceof ArrayWritable) { generator.writeBeginArray(); for (Writable wrt : ((ArrayWritable) writable).get()) { if (!write(wrt, generator)) { return false; } } generator.writeEndArray(); } else if (writable instanceof AbstractMapWritable) { Map<Writable, Writable> map = (Map<Writable, Writable>) writable; generator.writeBeginObject(); // ignore handling sets (which are just maps with null values) for (Entry<Writable, Writable> entry : map.entrySet()) { generator.writeFieldName(entry.getKey().toString()); if (!write(entry.getValue(), generator)) { return false; } } generator.writeEndObject(); } else { if (writeUnknownTypes) { return handleUnknown(writable, generator); } return false; } return true; }
From source file:org.elasticsearch.hadoop.mr.SafeWritableConverter.java
License:Apache License
public void invoke(Object from, BytesArray to) {
    // handle common cases
    if (from instanceof Text) {
        Text t = (Text) from;
        to.bytes(t.getBytes(), t.getLength());
    }
    if (from instanceof BytesWritable) {
        BytesWritable b = (BytesWritable) from;
        to.bytes(b.getBytes(), b.getLength());
    }
}
From source file:org.elasticsearch.hadoop.mr.WritableBytesConverter.java
License:Apache License
@Override
public void convert(Object from, BytesArray to) {
    // handle common cases
    if (from instanceof Text) {
        Text t = (Text) from;
        to.bytes(t.getBytes(), t.getLength());
        return;
    }
    if (from instanceof BytesWritable) {
        BytesWritable b = (BytesWritable) from;
        to.bytes(b.getBytes(), b.getLength());
        return;
    }
    super.convert(from, to);
}
From source file:org.elasticsearch.hadoop.mr.WritableValueWriter.java
License:Apache License
@Override
@SuppressWarnings({ "unchecked", "deprecation" })
public Result write(Writable writable, Generator generator) {
    if (writable == null || writable instanceof NullWritable) {
        generator.writeNull();
    } else if (writable instanceof Text) {
        Text text = (Text) writable;
        generator.writeUTF8String(text.getBytes(), 0, text.getLength());
    } else if (writable instanceof UTF8) {
        UTF8 utf8 = (UTF8) writable;
        generator.writeUTF8String(utf8.getBytes(), 0, utf8.getLength());
    } else if (WritableCompatUtil.isShortWritable(writable)) {
        generator.writeNumber(WritableCompatUtil.unwrap(writable));
    } else if (writable instanceof IntWritable) {
        generator.writeNumber(((IntWritable) writable).get());
    } else if (writable instanceof LongWritable) {
        generator.writeNumber(((LongWritable) writable).get());
    } else if (writable instanceof VLongWritable) {
        generator.writeNumber(((VLongWritable) writable).get());
    } else if (writable instanceof VIntWritable) {
        generator.writeNumber(((VIntWritable) writable).get());
    } else if (writable instanceof ByteWritable) {
        generator.writeNumber(((ByteWritable) writable).get());
    } else if (writable instanceof DoubleWritable) {
        generator.writeNumber(((DoubleWritable) writable).get());
    } else if (writable instanceof FloatWritable) {
        generator.writeNumber(((FloatWritable) writable).get());
    } else if (writable instanceof BooleanWritable) {
        generator.writeBoolean(((BooleanWritable) writable).get());
    } else if (writable instanceof BytesWritable) {
        BytesWritable bw = (BytesWritable) writable;
        generator.writeBinary(bw.getBytes(), 0, bw.getLength());
    } else if (writable instanceof MD5Hash) {
        generator.writeString(writable.toString());
    } else if (writable instanceof ArrayWritable) {
        generator.writeBeginArray();
        for (Writable wrt : ((ArrayWritable) writable).get()) {
            Result result = write(wrt, generator);
            if (!result.isSuccesful()) {
                return result;
            }
        }
        generator.writeEndArray();
    } else if (writable instanceof AbstractMapWritable) {
        Map<Writable, Writable> map = (Map<Writable, Writable>) writable;
        generator.writeBeginObject();
        // ignore handling sets (which are just maps with null values)
        for (Entry<Writable, Writable> entry : map.entrySet()) {
            String fieldName = entry.getKey().toString();
            if (shouldKeep(generator.getParentPath(), fieldName)) {
                generator.writeFieldName(fieldName);
                Result result = write(entry.getValue(), generator);
                if (!result.isSuccesful()) {
                    return result;
                }
            }
        }
        generator.writeEndObject();
    } else {
        if (writeUnknownTypes) {
            return handleUnknown(writable, generator);
        }
        return Result.FAILED(writable);
    }
    return Result.SUCCESFUL();
}
From source file:org.freeeed.mr.FreeEedReducer.java
License:Apache License
protected void processMap(MapWritable value) throws IOException, InterruptedException {
    columnMetadata.reinit();
    ++outputFileCount;
    DocumentMetadata allMetadata = getAllMetadata(value);
    Metadata standardMetadata = getStandardMetadata(allMetadata, outputFileCount);
    columnMetadata.addMetadata(standardMetadata);
    columnMetadata.addMetadata(allMetadata);
    // documents other than the first one in this loop are either duplicates or attachments
    if (first) {
        masterOutputFileCount = outputFileCount;
    } else {
        if (allMetadata.hasParent()) {
            columnMetadata.addMetadataValue(DocumentMetadataKeys.ATTACHMENT_PARENT,
                    UPIFormat.format(masterOutputFileCount));
        } else {
            columnMetadata.addMetadataValue(DocumentMetadataKeys.MASTER_DUPLICATE,
                    UPIFormat.format(masterOutputFileCount));
        }
    }
    // String uniqueId = allMetadata.getUniqueId();
    String originalFileName = new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName();
    // add the text to the text folder
    String documentText = allMetadata.get(DocumentMetadataKeys.DOCUMENT_TEXT);
    String textEntryName = ParameterProcessing.TEXT + "/" + UPIFormat.format(outputFileCount) + "_"
            + originalFileName + ".txt";
    if (textEntryName != null) {
        zipFileWriter.addTextFile(textEntryName, documentText);
    }
    columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_TEXT, textEntryName);
    // add the native file to the native folder
    String nativeEntryName = ParameterProcessing.NATIVE + "/" + UPIFormat.format(outputFileCount) + "_"
            + originalFileName;
    BytesWritable bytesWritable = (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE));
    if (bytesWritable != null) { // some large exception files are not passed
        zipFileWriter.addBinaryFile(nativeEntryName, bytesWritable.getBytes(), bytesWritable.getLength());
        logger.trace("Processing file: {}", nativeEntryName);
    }
    columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_NATIVE, nativeEntryName);
    // add the pdf made from the native file to the PDF folder
    String pdfNativeEntryName = ParameterProcessing.PDF_FOLDER + "/" + UPIFormat.format(outputFileCount) + "_"
            + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName() + ".pdf";
    BytesWritable pdfBytesWritable = (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE_AS_PDF));
    if (pdfBytesWritable != null) {
        zipFileWriter.addBinaryFile(pdfNativeEntryName, pdfBytesWritable.getBytes(),
                pdfBytesWritable.getLength());
        logger.trace("Processing file: {}", pdfNativeEntryName);
    }

    processHtmlContent(value, allMetadata, UPIFormat.format(outputFileCount));

    // add the exception to the exception folder
    String exception = allMetadata.get(DocumentMetadataKeys.PROCESSING_EXCEPTION);
    if (exception != null) {
        String exceptionEntryName = "exception/" + UPIFormat.format(outputFileCount) + "_"
                + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName();
        if (bytesWritable != null) {
            zipFileWriter.addBinaryFile(exceptionEntryName, bytesWritable.getBytes(),
                    bytesWritable.getLength());
        }
        columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_EXCEPTION, exceptionEntryName);
    }
    // write this all to the reduce map
    // context.write(new Text(outputKey), new Text(columnMetadata.delimiterSeparatedValues()));
    // drop the key altogether, because it messes up the format - but put it in the value
    // TODO use NullWritable
    if (OsUtil.isNix()) {
        context.write(null, new Text(columnMetadata.delimiterSeparatedValues()));
    }
    // prepare for the next file with the same key, if there is any
    first = false;
}
From source file:org.freeeed.mr.FreeEedReducer.java
License:Apache License
private void processHtmlContent(MapWritable value, Metadata allMetadata, String uniqueId) throws IOException {
    BytesWritable htmlBytesWritable = (BytesWritable) value
            .get(new Text(ParameterProcessing.NATIVE_AS_HTML_NAME));
    if (htmlBytesWritable != null) {
        String htmlNativeEntryName = ParameterProcessing.HTML_FOLDER + "/" + uniqueId + "_"
                + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName() + ".html";
        zipFileWriter.addBinaryFile(htmlNativeEntryName, htmlBytesWritable.getBytes(),
                htmlBytesWritable.getLength());
        logger.trace("Processing file: {}", htmlNativeEntryName);
    }

    // get the list of other files that are part of the html output
    Text htmlFiles = (Text) value.get(new Text(ParameterProcessing.NATIVE_AS_HTML));
    if (htmlFiles != null) {
        String fileNames = htmlFiles.toString();
        String[] fileNamesArr = fileNames.split(",");
        for (String fileName : fileNamesArr) {
            String entry = ParameterProcessing.HTML_FOLDER + "/" + fileName;
            BytesWritable imageBytesWritable = (BytesWritable) value
                    .get(new Text(ParameterProcessing.NATIVE_AS_HTML + "_" + fileName));
            if (imageBytesWritable != null) {
                zipFileWriter.addBinaryFile(entry, imageBytesWritable.getBytes(),
                        imageBytesWritable.getLength());
                logger.trace("Processing file: {}", entry);
            }
        }
    }
}