List of usage examples for org.apache.hadoop.io Text getLength
@Override public int getLength()
From source file:org.cloudata.examples.web.TermUploadMap.java
License:Apache License
/**
 * Emits every incoming value as an output key paired with an empty {@link Text}, and prints
 * one sample record (decoded as EUC-KR) each time the running record count reaches a
 * multiple of 50000.
 *
 * @param key       not read
 * @param value     the record; cast to {@link Text}
 * @param collector receives {@code (value, "")}
 * @param reporter  not used
 * @throws IOException if collecting the record or decoding the sample fails
 */
public void map(WritableComparable key, Writable value,
        OutputCollector<WritableComparable, Writable> collector, Reporter reporter) throws IOException {
    final Text record = (Text) value;
    collector.collect(record, new Text(""));

    count++;
    if (count % 50000 != 0) {
        return;
    }

    // Decode only the first getLength() bytes of the buffer handed out by getBytes().
    final String sample = new String(record.getBytes(), 0, record.getLength(), "EUC-KR");
    System.out.println(sample);
}
From source file:org.cloudata.examples.web.TermUploadReduce.java
License:Apache License
public void reduce(WritableComparable key, Iterator<Writable> values, OutputCollector<WritableComparable, Writable> collector, Reporter reporter) throws IOException { if (exception != null) { throw exception; }/* w w w.ja v a 2 s .com*/ Text tKey = (Text) key; int keyIndex = tKey.find("\t"); if (keyIndex < 0) { LOG.error("invalid value:" + tKey); return; } Row.Key rowKey = new Row.Key(tKey.getBytes(), 0, keyIndex); String keyStr = new String(tKey.getBytes(), keyIndex + 1, (tKey.getLength() - keyIndex - 1), "EUC-KR"); //term, ?(tf), documentId url, freq, weight //term, ?(df), df String[] valueTokens = keyStr.split("\t"); if (rowKey.getLength() < TestWebPage.MIN_TERM_LENGTH) { return; } count++; if (count % 50000 == 0) { System.out.println(new Date() + ":" + keyStr); } if (valueTokens.length == 2 && "df".equals(valueTokens[0])) { Row row = new Row(rowKey); row.addCell("df", new Cell(Cell.Key.EMPTY_KEY, valueTokens[1].getBytes())); dfUploader.put(row); } else if (valueTokens.length == 4 && "tf".equals(valueTokens[0])) { Row row = new Row(rowKey); String documentId = valueTokens[1]; String freq = valueTokens[2]; String weight = valueTokens[3]; row.addCell("tf", new Cell(new Cell.Key(documentId), freq.getBytes())); row.addCell("weight", new Cell(new Cell.Key(documentId), weight.getBytes())); byte[] documentIdBytes = documentId.getBytes(); row.addCell("i_weight", new Cell(new Cell.Key((df.format(1.0 - Double.parseDouble(weight)) + documentId).getBytes()), documentIdBytes)); weightUploader.put(row); } else { LOG.error("invalid value:" + valueTokens.length + "," + count + "," + valueTokens[1] + "," + keyStr); return; } }
From source file:org.cloudata.examples.web.TermWeightReduce.java
License:Apache License
public void reduce(WritableComparable key, Iterator<Writable> values, OutputCollector<WritableComparable, Writable> collector, Reporter reporter) throws IOException { if (exception != null) { throw exception; }//from w ww . java 2s .c om //key: term, value: documentId , freq, docLength Text tKey = (Text) key; Row.Key rowKey = new Row.Key(tKey.getBytes(), 0, tKey.getLength()); String keyStr = new String(tKey.getBytes(), 0, tKey.getLength(), "EUC-KR"); if (tKey.getLength() == 0 || keyStr.trim().length() < TestWebPage.MIN_TERM_LENGTH) { return; } List<Object[]> termFreqs = new ArrayList<Object[]>(100); Set<String> docs = new HashSet<String>(); while (values.hasNext()) { Text tValue = (Text) values.next(); String valueStr = tValue.toString(); String[] valueTokens = valueStr.split("\t"); if (valueTokens.length < 3) { LOG.error("valueTokens != 3:" + valueStr); return; } String documentId = valueTokens[0]; int freq = Integer.parseInt(valueTokens[1]); long docLength = Long.parseLong(valueTokens[2]); docs.add(documentId); termFreqs.add(new Object[] { documentId, freq, docLength }); if (termFreqs.size() > 100000) { LOG.info("Too many tf:term=" + keyStr); break; } } int numOfdocument = docs.size(); for (Object[] eachValue : termFreqs) { String documentId = (String) eachValue[0]; int freq = (Integer) eachValue[1]; long docLength = (Long) eachValue[2]; double weight = getTermWeight(freq, docLength, avgDocLength, sumDocCount, numOfdocument); collector.collect(tKey, new Text("tf\t" + documentId + "\t" + String.valueOf(freq) + "\t" + df.format(weight))); termCount++; if (termCount % 100000 == 0) { System.out.println("term=" + keyStr + ",document=" + documentId + ",freq=" + freq + ",df=" + numOfdocument + "weight=" + df.format(weight)); } } collector.collect(tKey, new Text("df\t" + numOfdocument)); if (termCount % 100 == 0) { partitionOut.write(tKey.getBytes()); partitionOut.write("\n".getBytes()); } }
From source file:org.cloudata.examples.web.TermWeightReduceOnline.java
License:Apache License
public void reduce(WritableComparable key, Iterator<Writable> values, OutputCollector<WritableComparable, Writable> collector, Reporter reporter) throws IOException { if (exception != null) { throw exception; }/* w w w .j a v a 2 s .c om*/ //key: term, value: documentId , freq, docLength Text tKey = (Text) key; Row.Key rowKey = new Row.Key(tKey.getBytes(), 0, tKey.getLength()); String keyStr = new String(tKey.getBytes(), 0, tKey.getLength(), "EUC-KR"); if (tKey.getLength() == 0 || keyStr.trim().length() < TestWebPage.MIN_TERM_LENGTH) { return; } Row row = termTable.get(rowKey, "df"); if (row == null || row.getColumnSize() == 0) { LOG.error("No df for term:" + keyStr); return; } int docFreq = row.getOne("df").getValue().getValueAsInt(); Row iRow = new Row(rowKey); int count = 0; List<ColumnValue> tfColumnValues = new ArrayList<ColumnValue>(); List<ColumnValue> weightColumnValues = new ArrayList<ColumnValue>(); List<ColumnValue> iWeightColumnValues = new ArrayList<ColumnValue>(); while (values.hasNext()) { Text tValue = (Text) values.next(); String valueStr = tValue.toString(); String[] valueTokens = valueStr.split("\t"); if (valueTokens.length < 3) { LOG.error("valueTokens != 3:" + valueStr); return; } String documentId = valueTokens[0]; int freq = Integer.parseInt(valueTokens[1]); long docLength = Long.parseLong(valueTokens[2]); double weight = getTermWeight(freq, docLength, avgDocLength, sumDocCount, docFreq); iRow.addCell("tf", new Cell(new Cell.Key(documentId), Integer.toString(freq).getBytes())); iRow.addCell("weigth", new Cell(new Cell.Key(documentId), Integer.toString(freq).getBytes())); byte[] documentIdBytes = documentId.getBytes(); iRow.addCell("i_weight", new Cell(new Cell.Key((df.format(1.0 - weight) + documentId).getBytes()), documentIdBytes)); if (termCount % 100000 == 0) { System.out.println("term=" + keyStr + ",document=" + documentId + ",freq=" + freq + ",df=" + docFreq + "weight=" + df.format(weight)); } termCount++; count++; if (count % 500 == 0) 
{ try { termTable.put(iRow); } catch (Exception e) { LOG.error(e); } } iRow = new Row(rowKey); } try { termTable.put(iRow); } catch (Exception e) { LOG.error(e); } }
From source file:org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java
License:Apache License
static void validateSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits, RecordReader<Text, BytesWritable> reader) throws IOException, InterruptedException { int splitDataIndex = getIndexOfSplit(splits, split); Assert.assertTrue(splitDataIndex != -1); List<TestRecord> records = splits.get(splitDataIndex).e1; int itemIndex = 0; // iterate and validate stuff ... Text key = new Text(); BytesWritable value = new BytesWritable(); while (reader.next(key, value)) { TestRecord testRecord = records.get(itemIndex++); // get test key bytes as utf-8 bytes ... byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8")); // compare against raw key bytes to validate key is the same (Text's utf-8 mapping code replaces invalid characters // with ?, which causes our test case (which does use invalid characters to from the key, to break. Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0); // retured bytes represent the header(encoded in utf-8), terminated by a \r\n\r\n. The content follows this terminator // we search for this specific byte pattern to locate start of content, then compare it against source ... int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(), "\r\n\r\n".getBytes()); indexofHeaderTerminator += 4;//from w w w . j a va 2 s .c o m Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length, value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0); } reader.close(); Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT); splits.remove(splitDataIndex); }
From source file:org.commoncrawl.hadoop.io.mapred.ArcFileInputFormatTests.java
License:Apache License
static void validateArcFileItemSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits, RecordReader<Text, ArcFileItem> reader) throws IOException, InterruptedException { int splitDataIndex = getIndexOfSplit(splits, split); Assert.assertTrue(splitDataIndex != -1); List<TestRecord> records = splits.get(splitDataIndex).e1; int itemIndex = 0; // iterate and validate stuff ... Text key = new Text(); ArcFileItem value = new ArcFileItem(); while (reader.next(key, value)) { TestRecord testRecord = records.get(itemIndex++); // get test key bytes as utf-8 bytes ... byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8")); // compare against raw key bytes to validate key is the same (Text's utf-8 mapping code replaces invalid characters // with ?, which causes our test case (which does use invalid characters to from the key, to break. Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0); // retured bytes represent the header(encoded in utf-8), terminated by a \r\n\r\n. The content follows this terminator // we search for this specific byte pattern to locate start of content, then compare it against source ... Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length, value.getContent().getReadOnlyBytes(), value.getContent().getOffset(), value.getContent().getCount()) == 0); NIOHttpHeaders headers = ArcFileItemUtils.buildHeaderFromArcFileItemHeaders(value.getHeaderItems()); // validate metadata Assert.assertEquals("text/html", headers.findValue(Constants.ARCFileHeader_ARC_MimeType)); Assert.assertEquals(value.getArcFilePos(), testRecord.streamPos); Assert.assertEquals(value.getArcFileSize(), testRecord.rawSize); Assert.assertEquals("test-value", headers.findValue("test")); Assert.assertEquals(value.getArcFileName(), ((FileSplit) split).getPath().getName()); }//from w w w . j a v a 2s . 
c om reader.close(); Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT); splits.remove(splitDataIndex); }
From source file:org.commoncrawl.hadoop.io.mapred.ArcFileRecordReaderTests.java
License:Apache License
@Test public void TestARCFileRecordReader() throws IOException, InterruptedException { JobConf conf = new JobConf(); FileSystem fs = LocalFileSystem.get(conf); Path path = new Path("/tmp/" + File.createTempFile("ARCRecordReader", "test")); List<TestRecord> records = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT); FSDataOutputStream os = fs.create(path); try {/*w ww . j a v a 2s . com*/ // write the ARC File into memory ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis()); long testAttemptTime = System.currentTimeMillis(); for (TestRecord record : records) { ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length, new NIOHttpHeaders(), "text/html", MD5Hash.digest(record.data).toString(), 12345, testAttemptTime); } os.flush(); } finally { os.close(); } FileSplit split = new FileSplit(path, 0, fs.getFileStatus(path).getLen(), new String[0]); ARCFileRecordReader reader = new ARCFileRecordReader(); reader.initialize(conf, split); int index = 0; // iterate and validate stuff ... Text key = reader.createKey(); BytesWritable value = reader.createValue(); while (reader.next(key, value)) { TestRecord testRecord = records.get(index++); // get test key bytes as utf-8 bytes ... byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8")); // compare against raw key bytes to validate key is the same (Text's utf-8 mapping code replaces invalid characters // with ?, which causes our test case (which does use invalid characters to from the key, to break. Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0); // retured bytes represent the header(encoded in utf-8), terminated by a \r\n\r\n. The content follows this terminator // we search for this specific byte pattern to locate start of content, then compare it against source ... 
int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(), "\r\n\r\n".getBytes()); indexofHeaderTerminator += 4; Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length, value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0); } reader.close(); Assert.assertEquals(index, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT); fs.delete(path, false); }
From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileInputFormatTests.java
License:Apache License
static void validateSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits, RecordReader<Text, BytesWritable> reader) throws IOException, InterruptedException { int splitDataIndex = getIndexOfSplit(splits, split); Assert.assertTrue(splitDataIndex != -1); List<TestRecord> records = splits.get(splitDataIndex).e1; int itemIndex = 0; // iterate and validate stuff ... while (reader.nextKeyValue()) { Text key = reader.getCurrentKey(); BytesWritable value = reader.getCurrentValue(); TestRecord testRecord = records.get(itemIndex++); // get test key bytes as utf-8 bytes ... byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8")); // compare against raw key bytes to validate key is the same (Text's utf-8 mapping code replaces invalid characters // with ?, which causes our test case (which does use invalid characters to from the key, to break. Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0); // retured bytes represent the header(encoded in utf-8), terminated by a \r\n\r\n. The content follows this terminator // we search for this specific byte pattern to locate start of content, then compare it against source ... int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(), "\r\n\r\n".getBytes()); indexofHeaderTerminator += 4;/*from www . ja va 2 s. c o m*/ Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length, value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0); } reader.close(); Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT); splits.remove(splitDataIndex); }
From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileInputFormatTests.java
License:Apache License
static void validateArcFileItemSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits, RecordReader<Text, ArcFileItem> reader) throws IOException, InterruptedException { int splitDataIndex = getIndexOfSplit(splits, split); Assert.assertTrue(splitDataIndex != -1); List<TestRecord> records = splits.get(splitDataIndex).e1; int itemIndex = 0; // iterate and validate stuff ... while (reader.nextKeyValue()) { Text key = reader.getCurrentKey(); ArcFileItem value = reader.getCurrentValue(); TestRecord testRecord = records.get(itemIndex++); // get test key bytes as utf-8 bytes ... byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8")); // compare against raw key bytes to validate key is the same (Text's utf-8 mapping code replaces invalid characters // with ?, which causes our test case (which does use invalid characters to from the key, to break. Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0); // retured bytes represent the header(encoded in utf-8), terminated by a \r\n\r\n. The content follows this terminator // we search for this specific byte pattern to locate start of content, then compare it against source ... Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length, value.getContent().getReadOnlyBytes(), value.getContent().getOffset(), value.getContent().getCount()) == 0); NIOHttpHeaders headers = ArcFileItemUtils.buildHeaderFromArcFileItemHeaders(value.getHeaderItems()); // validate metadata Assert.assertEquals("text/html", headers.findValue(Constants.ARCFileHeader_ARC_MimeType)); Assert.assertEquals(value.getArcFilePos(), testRecord.streamPos); Assert.assertEquals(value.getArcFileSize(), testRecord.rawSize); Assert.assertEquals("test-value", headers.findValue("test")); Assert.assertEquals(value.getArcFileName(), ((FileSplit) split).getPath().getName()); }//w ww. java2 s . 
c om reader.close(); Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT); splits.remove(splitDataIndex); }
From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileRecordReaderTests.java
License:Apache License
@Test public void TestARCFileRecordReader() throws IOException, InterruptedException { Configuration conf = new Configuration(); FileSystem fs = LocalFileSystem.get(conf); Path path = new Path("/tmp/" + File.createTempFile("ARCRecordReader", "test")); List<TestRecord> records = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT); FSDataOutputStream os = fs.create(path); try {/* w w w .j ava 2s .c o m*/ // write the ARC File into memory ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis()); long testAttemptTime = System.currentTimeMillis(); for (TestRecord record : records) { ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length, new NIOHttpHeaders(), "text/html", MD5Hash.digest(record.data).toString(), 12345, testAttemptTime); } os.flush(); } finally { os.close(); } FileSplit split = new FileSplit(path, 0, fs.getFileStatus(path).getLen(), new String[0]); ARCFileRecordReader reader = new ARCFileRecordReader(); reader.initialize(split, new TaskAttemptContext(conf, new TaskAttemptID())); int index = 0; // iterate and validate stuff ... while (reader.nextKeyValue()) { Text key = reader.getCurrentKey(); BytesWritable value = reader.getCurrentValue(); TestRecord testRecord = records.get(index++); // get test key bytes as utf-8 bytes ... byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8")); // compare against raw key bytes to validate key is the same (Text's utf-8 mapping code replaces invalid characters // with ?, which causes our test case (which does use invalid characters to from the key, to break. Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0); // retured bytes represent the header(encoded in utf-8), terminated by a \r\n\r\n. The content follows this terminator // we search for this specific byte pattern to locate start of content, then compare it against source ... 
int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(), "\r\n\r\n".getBytes()); indexofHeaderTerminator += 4; Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length, value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0); } reader.close(); Assert.assertEquals(index, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT); fs.delete(path, false); }