List of usage examples for org.apache.hadoop.io Text getLength
@Override public int getLength()
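getLength() returns the number of valid bytes in the Text's backing buffer. Because getBytes() may hand back an array that is longer than the logical content (Text reuses its buffer), the two calls are almost always paired, as in the examples below. A minimal standalone sketch; the class name and sample string are illustrative and not taken from any of the source files listed here:

import java.util.Arrays;
import org.apache.hadoop.io.Text;

public class TextGetLengthExample {
    public static void main(String[] args) {
        Text line = new Text("hello");
        // getBytes() may return a buffer larger than the logical content,
        // so bound any read by getLength(), never by the array length.
        byte[] backing = line.getBytes();
        int len = line.getLength();
        byte[] copy = Arrays.copyOf(backing, len); // safe copy of just the content
        System.out.println(len + " valid bytes, buffer holds " + backing.length);
    }
}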
From source file:org.apache.pig.builtin.JsonLoader.java
License:Apache License
public Tuple getNext() throws IOException {
    Text val = null;
    try {
        // Read the next key value pair from the record reader. If it's
        // finished, return null
        if (!reader.nextKeyValue()) return null;

        // Get the current value. We don't use the key.
        val = (Text) reader.getCurrentValue();
    } catch (InterruptedException ie) {
        throw new IOException(ie);
    }

    // Create a parser specific for this input line. This may not be the
    // most efficient approach.
    byte[] newBytes = new byte[val.getLength()];
    System.arraycopy(val.getBytes(), 0, newBytes, 0, val.getLength());
    ByteArrayInputStream bais = new ByteArrayInputStream(newBytes);
    JsonParser p = jsonFactory.createJsonParser(bais);

    // Create the tuple we will be returning. We create it with the right
    // number of fields, as the Tuple object is optimized for this case.
    ResourceFieldSchema[] fields = schema.getFields();
    Tuple t = tupleFactory.newTuple(fields.length);

    // Read the start object marker. Throughout this file if the parsing
    // isn't what we expect we return a tuple with null fields rather than
    // throwing an exception. That way a few mangled lines don't fail the
    // job.
    if (p.nextToken() != JsonToken.START_OBJECT) {
        warn("Bad record, could not find start of record " + val.toString(), PigWarning.UDF_WARNING_1);
        return t;
    }

    // Read each field in the record
    for (int i = 0; i < fields.length; i++) {
        t.set(i, readField(p, fields[i], i));
    }

    if (p.nextToken() != JsonToken.END_OBJECT) {
        warn("Bad record, could not find end of record " + val.toString(), PigWarning.UDF_WARNING_1);
        return t;
    }
    p.close();

    return t;
}
From source file:org.apache.pig.builtin.PigStorage.java
License:Apache License
@Override
public Tuple getNext() throws IOException {
    mProtoTuple = new ArrayList<Object>();

    if (!mRequiredColumnsInitialized) {
        if (signature != null) {
            Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
            mRequiredColumns = (boolean[]) ObjectSerializer.deserialize(p.getProperty(signature));
        }
        mRequiredColumnsInitialized = true;
    }

    // Prepend input source path if source tagging is enabled
    if (tagFile) {
        mProtoTuple.add(new DataByteArray(sourcePath.getName()));
    } else if (tagPath) {
        mProtoTuple.add(new DataByteArray(sourcePath.toString()));
    }

    try {
        boolean notDone = in.nextKeyValue();
        if (!notDone) {
            return null;
        }
        Text value = (Text) in.getCurrentValue();
        byte[] buf = value.getBytes();
        int len = value.getLength();
        int start = 0;
        int fieldID = 0;

        for (int i = 0; i < len; i++) {
            if (buf[i] == fieldDel) {
                if (mRequiredColumns == null || (mRequiredColumns.length > fieldID && mRequiredColumns[fieldID]))
                    addTupleValue(mProtoTuple, buf, start, i);
                start = i + 1;
                fieldID++;
            }
        }

        // pick up the last field
        if (start <= len
                && (mRequiredColumns == null || (mRequiredColumns.length > fieldID && mRequiredColumns[fieldID]))) {
            addTupleValue(mProtoTuple, buf, start, len);
        }

        Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
        return dontLoadSchema ? t : applySchema(t);
    } catch (InterruptedException e) {
        int errCode = 6018;
        String errMsg = "Error while reading input";
        throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
    }
}
From source file:org.apache.pig.builtin.TextLoader.java
License:Apache License
@Override
public Tuple getNext() throws IOException {
    try {
        boolean notDone = in.nextKeyValue();
        if (!notDone) {
            return null;
        }
        Text value = (Text) in.getCurrentValue();
        byte[] ba = value.getBytes();
        // make a copy of the bytes representing the input since
        // TextInputFormat will reuse the byte array
        return mTupleFactory.newTuple(new DataByteArray(ba, 0, value.getLength()));
    } catch (InterruptedException e) {
        throw new IOException("Error getting input");
    }
}
From source file:org.apache.pig.impl.util.StorageUtil.java
License:Apache License
/**
 * Transform a line of <code>Text</code> to a <code>Tuple</code>
 *
 * @param val a line of text
 * @param fieldDel the field delimiter
 * @return tuple constructed from the text
 */
public static Tuple textToTuple(Text val, byte fieldDel) {
    return bytesToTuple(val.getBytes(), 0, val.getLength(), fieldDel);
}
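A minimal usage sketch for the helper above, assuming a tab-delimited line; the class name and sample data are illustrative, and the output comment reflects the usual Tuple.toString() rendering rather than a guaranteed format:

import org.apache.hadoop.io.Text;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.StorageUtil;

public class TextToTupleExample {
    public static void main(String[] args) {
        Text line = new Text("alice\t42\tnyc");
        // textToTuple() reads line.getBytes() only up to line.getLength(),
        // splitting fields on the given single-byte delimiter (tab here).
        Tuple t = StorageUtil.textToTuple(line, (byte) '\t');
        System.out.println(t); // typically prints (alice,42,nyc)
    }
}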
From source file:org.apache.pig.piggybank.storage.CSVExcelStorage.java
License:Apache License
@Override
public Tuple getNext() throws IOException {
    // If SKIP_INPUT_HEADER and this is the first input split, skip header record
    // We store its value as a string though, so we can compare
    // further records to it. If they are the same (this would
    // happen if multiple small files each with a header were combined
    // into one split), we know to skip the duplicate header record as well.
    if (loadingFirstRecord && headerTreatment == Headers.SKIP_INPUT_HEADER
            && (splitIndex == 0 || splitIndex == -1)) {
        try {
            if (!in.nextKeyValue())
                return null;
            header = ((Text) in.getCurrentValue()).toString();
        } catch (InterruptedException e) {
            int errCode = 6018;
            String errMsg = "Error while reading input";
            throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
        }
    }
    loadingFirstRecord = false;

    mProtoTuple = new ArrayList<Object>();

    getNextInQuotedField = false;
    boolean evenQuotesSeen = true;
    boolean sawEmbeddedRecordDelimiter = false;
    byte[] buf = null;

    if (!mRequiredColumnsInitialized) {
        if (udfContextSignature != null) {
            Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass());
            mRequiredColumns = (boolean[]) ObjectSerializer.deserialize(p.getProperty(udfContextSignature));
        }
        mRequiredColumnsInitialized = true;
    }

    // Note: we cannot factor out the check for nextKeyValue() being null,
    // because that call overwrites buf with the new line, which is
    // bad if we have a field with a newline.

    try {
        int recordLen = 0;
        getNextFieldID = 0;

        while (sawEmbeddedRecordDelimiter || getNextFieldID == 0) {
            Text value = null;
            if (sawEmbeddedRecordDelimiter) {

                // Deal with pulling more records from the input, because
                // a double quoted embedded newline was encountered in a field.
                // Save the length of the record so far, plus one byte for the
                // record delimiter (usually newline) that's embedded in the field
                // we were working on before falling into this branch:
                int prevLineLen = recordLen + 1;

                // Save previous line (the one with the field that has the newline) in a new array.
                // The last byte will be random; we'll fill in the embedded
                // record delimiter (usually newline) below:
                byte[] prevLineSaved = Arrays.copyOf(buf, prevLineLen);
                prevLineSaved[prevLineLen - 1] = RECORD_DEL;

                // Read the continuation of the record, unless EOF:
                if (!in.nextKeyValue()) {
                    return null;
                }
                value = (Text) in.getCurrentValue();
                recordLen = value.getLength();
                // Grab the continuation's bytes:
                buf = value.getBytes();

                // Combine the previous line and the continuation into a new array.
                // The following copyOf() does half the job: it allocates all the
                // space, and also copies the previous line into that space:
                byte[] prevLineAndContinuation = Arrays.copyOf(prevLineSaved, prevLineLen + recordLen);

                // Now append the continuation.
                // Parms: fromBuf, fromStartPos, toBuf, toStartPos, lengthToCopy:
                System.arraycopy(buf, 0, prevLineAndContinuation, prevLineLen, recordLen);

                // We'll work with the combination now:
                buf = prevLineAndContinuation;

                // Do the whole record over from the start:
                mProtoTuple.clear();
                getNextInQuotedField = false;
                evenQuotesSeen = true;
                getNextFieldID = 0;
                recordLen = prevLineAndContinuation.length;

            } else {
                // Previous record finished cleanly: start with the next record,
                // unless EOF:
                if (!in.nextKeyValue()) {
                    return null;
                }
                value = (Text) in.getCurrentValue();

                // if the line is a duplicate header and 'SKIP_INPUT_HEADER' is set, ignore it
                // (this might happen if multiple files each with a header are combined into a single split)
                if (headerTreatment == Headers.SKIP_INPUT_HEADER && value.toString().equals(header)) {
                    if (!in.nextKeyValue())
                        return null;
                    value = (Text) in.getCurrentValue();
                }

                buf = value.getBytes();
                getNextFieldID = 0;
                recordLen = value.getLength();
            }

            nextTupleSkipChar = false;

            ByteBuffer fieldBuffer = ByteBuffer.allocate(recordLen);

            sawEmbeddedRecordDelimiter = processOneInRecord(evenQuotesSeen, buf, recordLen, fieldBuffer);

            // The last field is never delimited by a FIELD_DEL, but by
            // the end of the record. So we need to add that last field.
            // The '!sawEmbeddedRecordDelimiter' handles the case of
            // embedded newlines; we are amidst a field, not at
            // the final record:
            if (!sawEmbeddedRecordDelimiter)
                readField(fieldBuffer, getNextFieldID++);
        } // end while

    } catch (InterruptedException e) {
        int errCode = 6018;
        String errMsg = "Error while reading input";
        throw new ExecException(errMsg, errCode, PigException.REMOTE_ENVIRONMENT, e);
    }

    Tuple t = mTupleFactory.newTupleNoCopy(mProtoTuple);
    return t;
}
From source file:org.apache.rya.accumulo.pig.AccumuloStorage.java
License:Apache License
@Override
public void putNext(final Tuple t) throws ExecException, IOException {
    final Mutation mut = new Mutation(objToText(t.get(0)));
    final Text cf = objToText(t.get(1));
    final Text cq = objToText(t.get(2));

    if (t.size() > 4) {
        final Text cv = objToText(t.get(3));
        final Value val = new Value(objToBytes(t.get(4)));
        if (cv.getLength() == 0) {
            mut.put(cf, cq, val);
        } else {
            mut.put(cf, cq, new ColumnVisibility(cv), val);
        }
    } else {
        final Value val = new Value(objToBytes(t.get(3)));
        mut.put(cf, cq, val);
    }

    try {
        writer.write(tableName, mut);
    } catch (final InterruptedException e) {
        throw new IOException(e);
    }
}
From source file:org.archive.access.nutch.indexer.WaxIndexingFilter.java
License:LGPL
public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) {
    if (url == null || url.getLength() <= 0) {
        LOGGER.error(doc.toString() + " has no url");
        return doc;
    }
    String urlStr = url.toString();

    // Stored, indexed and un-tokenized. Date is already GMT so don't
    // mess w/ timezones. Date is stored as seconds since epoch to
    // facilitate sorting (The Lucene Sort interprets the IA 14-char
    // date string as a float; rounding of float values equates floats
    // that shouldn't equate: e.g:
    //   float f = Float.parseFloat("20050524133833");
    //   float g = Float.parseFloat("20050524133834");
    //   float h = Float.parseFloat("20050524133835");
    //   System.out.println(f == g);
    //   System.out.println(f == h);
    // ...prints true twice.
    // So, have seconds since epoch for the date we index.
    long seconds = datum.getFetchTime() / 1000;
    if (seconds > Integer.MAX_VALUE) {
        LOGGER.warn("Fetch time " + Long.toString(seconds) + " is > Integer.MAX_VALUE. Setting to zero");
        seconds = 0;
    }
    doc.add(new Field(DATE_KEY, ArchiveUtils.zeroPadInteger((int) seconds), Field.Store.YES,
            Field.Index.UN_TOKENIZED));

    // Add as stored, unindexed, and untokenized. Don't warn if absent.
    // It's not a tragedy.
    add(urlStr, doc, "encoding", parse.getData().getMeta(ENCODING_KEY), false, true, true, false, false);

    // Get metadatas.
    MapWritable mw = datum.getMetaData();
    ParseData pd = parse.getData();

    // Add as stored, indexed, and untokenized but not lowercased.
    add(urlStr, doc, ARCCOLLECTION_KEY, getMetadataValue(ARCCOLLECTION_KEY, pd, mw), false, true, true, false);

    // Add as stored, indexed, and untokenized. Preserve case for
    // arcname since eventually it will be used to find an arc on
    // filesystem.
    add(urlStr, doc, ARCFILENAME_KEY, getMetadataValue(ARCFILENAME_KEY, pd, mw), false, true, true, false);

    add(urlStr, doc, ARCFILEOFFSET_KEY, getMetadataValue(ARCFILEOFFSET_KEY, pd, mw), false, true, false, false);

    // This is a nutch 'more' field.
    add(urlStr, doc, "contentLength", parse.getData().getMeta("contentLength"), false, true, false, false);

    // Mimetype. The ARC2Segment tool stores the content-type into
    // metadata with a key of 'content-type'.
    String mimetype = parse.getData().getMeta(CONTENT_TYPE_KEY);
    if (mimetype == null || mimetype.length() == 0) {
        MimeType mt = (MIME.getMimeType(urlStr));
        if (mt != null) {
            mimetype = mt.getName();
        }
    }
    try {
        // Test the mimetype makes some sense. If not, don't add.
        mimetype = (new MimeType(mimetype)).getName();
    } catch (MimeTypeException e) {
        LOGGER.error(urlStr + ", mimetype " + mimetype + ": " + e.toString());
        // Clear mimetype because caused exception.
        mimetype = null;
    }
    if (mimetype != null) {
        // wera wants the sub and primary types in index. So they are
        // stored but not searchable. nutch adds primary and subtypes
        // as well as complete type all to one 'type' field.
        final String type = "type";
        add(urlStr, doc, type, mimetype, true, false, true, false);
        int index = mimetype.indexOf('/');
        if (index > 0) {
            String tmp = mimetype.substring(0, index);
            add(urlStr, doc, "primaryType", tmp, true, true, false, false);
            add(urlStr, doc, type, tmp, true, false, true, false);
            if (index + 1 < mimetype.length()) {
                tmp = mimetype.substring(index + 1);
                add(urlStr, doc, "subType", tmp, true, true, false, false);
                add(urlStr, doc, type, tmp, true, false, true, false);
            }
        }
    }

    // Add as not lowercased, not stored, indexed, and not tokenized.
    add(urlStr, doc, EXACTURL_KEY, escapeUrl(url.toString()), false, false, true, false);

    // TODO MC - for site search
    try {
        java.net.URL netUrl = new java.net.URL(urlStr);
        String reverseDomain = (new StringBuffer(netUrl.getHost())).reverse().toString();
        add(urlStr, doc, DOMAIN_KEY, reverseDomain, false, true, true, false);
    } catch (Exception MalformedURLException) {
        LOGGER.error("Malformed url " + urlStr + ".");
    }
    // TODO MC - for site search

    return doc;
}
From source file:org.archive.nutchwax.ImporterToHdfs.java
License:Apache License
/**
 * Import an ARCRecord.
 *
 * @param record
 * @param segmentName
 * @param collectionName
 * @param output
 * @return whether record was imported or not (i.e. filtered out due to URL
 *         filtering rules, etc.)
 */
private boolean importRecord(ARCRecord record, String segmentName, String collectionName,
        OutputCollector output, Writer writer) {
    ARCRecordMetaData meta = record.getMetaData();

    if (LOG.isInfoEnabled()) {
        LOG.info("Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ") [" + meta.getLength() + "]");
    }

    if (!this.httpStatusCodeFilter.isAllowed(record.getStatusCode())) {
        if (LOG.isInfoEnabled()) {
            LOG.info("Skip URL: " + meta.getUrl() + " HTTP status:" + record.getStatusCode());
        }
        return false;
    }

    try {
        // Skip the HTTP headers in the response body, so that the
        // parsers are parsing the response body and not the HTTP
        // headers.
        record.skipHttpHeader();

        // We use record.available() rather than meta.getLength()
        // because the latter includes the size of the HTTP header,
        // which we just skipped.
        byte[] bytes = readBytes(record, record.available());

        // If there is no digest, then we assume we're reading an
        // ARCRecord not a WARCRecord. In that case, we close the
        // record, which updates the digest string. Then we tweak the
        // digest string so we have the same form for both ARC and WARC
        // records.
        if (meta.getDigest() == null) {
            record.close();

            // This is a bit hacky, but ARC and WARC records produce
            // two slightly different digest formats. WARC record
            // digests have the algorithm name as a prefix, such as
            // "sha1:PD3SS4WWZVFWTDC63RU2MWX7BVC2Y2VA" but the
            // ArcRecord.getDigestStr() does not. Since we want the
            // formats to match, we prepend the "sha1:" prefix to ARC
            // record digest.
            meta.setDigest("sha1:" + record.getDigestStr());
        }

        // Normalize and filter
        String url = this.normalizeAndFilterUrl(meta.getUrl(), meta.getDigest(), meta.getDate());
        if (url == null) {
            if (LOG.isInfoEnabled()) {
                LOG.info("Skip URL: " + meta.getUrl());
            }
            return false;
        }

        // We create a key which combines the URL and digest values.
        // This is necessary because Nutch stores all the data in
        // MapFiles, which are basically just {key,value} pairs.
        //
        // If we use just the URL as the key (which is the way Nutch
        // usually works) then we have problems with multiple,
        // different copies of the same URL. If we try and store two
        // different copies of the same URL (each having a different
        // digest) and only use the URL as the key, when the MapFile
        // is written, only *one* copy of the page will be stored.
        //
        // Think about it, we're basically doing:
        //   MapFile.put( url, value1 );
        //   MapFile.put( url, value2 );
        // Only one of those url,value mappings will keep, the other
        // is over-written.
        //
        // So, by using the url+digest as the key, we can have all the
        // data stored. The only problem is all over in Nutch where
        // the key==url is assumed :(
        String key = url + " " + meta.getDigest();

        Metadata contentMetadata = new Metadata();
        // Set the segment name, just as is done by standard Nutch fetching.
        // Then, add the NutchWAX-specific metadata fields.
        contentMetadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);

        // We store both the normal URL and the URL+digest key for
        // later retrieval by the indexing plugin(s).
        contentMetadata.set(NutchWax.URL_KEY, url);
        // contentMetadata.set( NutchWax.ORIG_KEY, key );

        contentMetadata.set(NutchWax.FILENAME_KEY, meta.getArcFile().getName());
        contentMetadata.set(NutchWax.FILEOFFSET_KEY, String.valueOf(record.getHeader().getOffset()));

        contentMetadata.set(NutchWax.COLLECTION_KEY, collectionName);
        contentMetadata.set(NutchWax.DATE_KEY, meta.getDate());
        contentMetadata.set(NutchWax.DIGEST_KEY, meta.getDigest());
        contentMetadata.set(NutchWax.CONTENT_TYPE_KEY, meta.getMimetype());
        contentMetadata.set(NutchWax.CONTENT_LENGTH_KEY, String.valueOf(meta.getLength()));
        contentMetadata.set(NutchWax.HTTP_RESPONSE_KEY, String.valueOf(record.getStatusCode()));

        Content content = new Content(url, url, bytes, meta.getMimetype(), contentMetadata, getConf());

        // -----------------
        // write to sequence file

        byte[] contentInOctets = content.getContent();
        String htmlraw = new String();

        // meta only contains char encodings
        // LOG.info("Metadata count: " + contentMetadata.names().length);
        // for (String name : contentMetadata.names()) {
        //     LOG.info("meta " + name + " : " + contentMetadata.get(name));
        // }

        // try getting content encoding
        try {
            htmlraw = new String(contentInOctets, contentMetadata.get("OriginalCharEncoding"));
        } catch (Exception e) {
            LOG.warn("could not get content with OriginalCharEncoding");
        }

        // if unable, try utf-8
        if (htmlraw.length() == 0) {
            try {
                htmlraw = new String(contentInOctets, "UTF-8");
            } catch (UnsupportedEncodingException e) {
                LOG.error("unable to convert content into string");
            }
        }

        URL url_h = null;
        try {
            url_h = new URL(content.getUrl());
        } catch (MalformedURLException e1) {
            LOG.error("Malformed URL Exception: " + e1.getMessage());
        }

        String protocol = url_h.getProtocol();
        String hostname = url_h.getHost();
        String urlpath = url_h.getPath();
        String param = url_h.getQuery();
        // LOG.info("HOST:" + hostname);
        // LOG.info("PATH:" + urlpath);
        // LOG.info("PROTOCOL:" + protocol);
        // LOG.info("PARAM: " + param);

        String date = meta.getDate();
        // LOG.info("meta date: " + date);

        Text key_h = new Text(protocol + "::" + hostname + "::" + urlpath + "::" + param + "::" + date);
        Text value = new Text(htmlraw);

        try {
            LOG.info("len: " + writer.getLength() + ", key: " + key_h + ", value len: " + value.getLength());
            writer.append(key_h, value);
        } catch (IOException e) {
            LOG.error("SequenceFile IOException: " + e.getMessage());
        }
        // -----------------

        output(output, new Text(key), content);

        return true;
    } catch (Throwable t) {
        LOG.error("Import fail : " + meta.getUrl(), t);
    }
    return false;
}
From source file:org.bdgenomics.adam.io.FastqRecordReader.java
License:Apache License
/**
 * Position the input stream at the start of the first record.
 *
 * @param stream The stream to reposition.
 */
protected final int positionAtFirstRecord(final FSDataInputStream stream, final CompressionCodec codec)
        throws IOException {
    Text buffer = new Text();
    long originalStart = start;

    LineReader reader;
    if (codec == null) {
        // Advance to the start of the first record that ends with /1
        // We use a temporary LineReader to read lines until we find the
        // position of the right one. We then seek the file to that position.
        stream.seek(start);
        reader = new LineReader(stream);
    } else {
        // Unlike the codec == null case, we don't seek before creating the
        // reader, SplittableCompressionCodec.createInputStream places the
        // stream at the start of the first compression block after our
        // split start
        //
        // as noted above, we need to be at pos 0 in the stream before
        // calling this
        reader = new LineReader(((SplittableCompressionCodec) codec).createInputStream(stream, null, start,
                end, SplittableCompressionCodec.READ_MODE.BYBLOCK));
    }

    int bytesRead = 0;
    do {
        bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start));
        int bufferLength = buffer.getLength();
        if (bytesRead > 0 && !checkBuffer(bufferLength, buffer)) {
            start += bytesRead;
        } else {
            // line starts with @. Read two more and verify that it starts
            // with a +:
            //
            // @<readname>
            // <sequence>
            // +[readname]
            //
            // if the second line we read starts with a @, we know that
            // we've read:
            //
            // <qualities> <-- @ is a valid ASCII phred encoding
            // @<readname>
            //
            // and thus, the second read is the delimiter and we can break
            long trackForwardPosition = start + bytesRead;

            bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start));
            if (buffer.getLength() > 0 && buffer.getBytes()[0] == '@') {
                start = trackForwardPosition;
                break;
            } else {
                trackForwardPosition += bytesRead;
            }

            bytesRead = reader.readLine(buffer, (int) Math.min(maxLineLength, end - start));
            trackForwardPosition += bytesRead;

            if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') {
                break; // all good!
            } else {
                start = trackForwardPosition;
            }
        }
    } while (bytesRead > 0);

    pos = start;
    start = originalStart;
    stream.seek(start);
    return (int) (pos - originalStart);
}
From source file:org.bdgenomics.adam.io.FastqRecordReader.java
License:Apache License
/**
 * Parses a read from an interleaved FASTQ file.
 *
 * Only reads a single record.
 *
 * @param readName Text record containing read name. Output parameter.
 * @param value Text record containing full record. Output parameter.
 * @return Returns true if read was successful (did not hit EOF).
 *
 * @throws RuntimeException Throws exception if FASTQ record doesn't
 *   have proper formatting (e.g., record doesn't start with @).
 */
protected final boolean lowLevelFastqRead(final Text readName, final Text value) throws IOException {
    if (endOfCompressedSplit) {
        return false;
    }

    // ID line
    readName.clear();
    long skipped = appendLineInto(readName, true);
    if (skipped == 0) {
        return false; // EOF
    }

    if (readName.getBytes()[0] != '@') {
        throw new RuntimeException("unexpected fastq record didn't start with '@' at " + makePositionMessage()
                + ". Line: " + readName + ". \n");
    }
    value.append(readName.getBytes(), 0, readName.getLength());

    // sequence
    appendLineInto(value, false);

    // separator line
    appendLineInto(value, false);

    // quality
    appendLineInto(value, false);

    return true;
}