Example usage for org.apache.hadoop.io Text getLength

List of usage examples for org.apache.hadoop.io Text getLength

Introduction

On this page you can find example usages of org.apache.hadoop.io.Text.getLength().

Prototype

@Override
public int getLength() 

Document

Returns the number of bytes in the byte array.
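
Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) illustrating what getLength() reports: the number of valid UTF-8 bytes, which can differ from the character count and from the size of the array returned by getBytes(). The class name is only for illustration.

import org.apache.hadoop.io.Text;

public class TextGetLengthExample {
    public static void main(String[] args) {
        Text text = new Text("café");                 // 4 characters, but 5 UTF-8 bytes
        System.out.println(text.getLength());         // prints 5
        System.out.println(text.toString().length()); // prints 4

        // The backing array returned by getBytes() is not trimmed when the value
        // shrinks, so reads must always be limited to getLength(), as the
        // examples on this page do.
        text.set(new Text("a noticeably longer value"));
        text.set(new Text("x"));
        System.out.println(text.getBytes().length > text.getLength()); // prints true
    }
}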

Usage

From source file:edu.umn.cs.spatialHadoop.io.TextSerializerHelper.java

License:Open Source License

public static OGCGeometry consumeGeometryESRI(Text text, char separator) {
    // Check whether this text is a Well Known Text (WKT) or a hexed string
    boolean wkt = false;
    byte[] bytes = text.getBytes();
    int length = text.getLength();
    int i_shape = 0;
    while (!wkt && i_shape < ShapeNames.length) {
        byte[] shapeName = ShapeNames[i_shape];
        if (length > shapeName.length) {
            int i = 0;
            while (i < shapeName.length && shapeName[i] == bytes[i])
                i++;
            if (i == shapeName.length) {
                wkt = true;
                break;
            }
        }
        i_shape++;
    }

    // Look for the terminator of the shape text
    int i1 = 0;
    if (bytes[i1] == '\'' || bytes[i1] == '\"') {
        separator = (char) bytes[i1++];
    }
    int i2 = i1;
    while (i2 < length && bytes[i2] != separator)
        i2++;

    String str = new String(bytes, i1, i2 - i1);

    // Remove consumed bytes from the text
    text.set(bytes, i2, text.getLength() - i2);

    OGCGeometry geom = parseText(str);

    return geom;
}

From source file:edu.umn.cs.spatialHadoop.io.TextSerializerHelper.java

License:Open Source License

public static synchronized Geometry consumeGeometryJTS(Text text, char separator) {
    // Check whether this text is a Well Known Text (WKT) or a hexed string
    boolean wkt = false;
    byte[] bytes = text.getBytes();
    int length = text.getLength();
    Geometry geom;
    int i1, i2; // Start and end offset of the geometry being parsed
    int i_next; // Beginning of the next field
    boolean isWKT = false;
    boolean isHex = false;
    if (bytes[0] == '\'' || bytes[0] == '\"') {
        // A quoted string. Find terminating quote and trim the quotes
        i1 = 1;
        i2 = 2;
        while (i2 < length && bytes[i2] != bytes[0])
            i2++;
        if (i2 == length)
            throw new RuntimeException("Unterminated quoted string");
        i_next = i2 + 1;
        i2--; // Back one step to remove the terminating quote
        isWKT = true; // Assume any quoted string to be WKT
    } else {
        // Not a quoted string, check if the type is WKT
        int i_shape = 0;
        while (!wkt && i_shape < ShapeNames.length) {
            byte[] shapeName = ShapeNames[i_shape];
            if (length > shapeName.length) {
                int i = 0;
                while (i < shapeName.length && shapeName[i] == bytes[i])
                    i++;
                if (i == shapeName.length) {
                    wkt = true;
                    break;
                }
            }
            i_shape++;
        }

        if (i_shape < ShapeNames.length) {
            isWKT = true;
            // Look for the terminator of the shape text
            i1 = 0;
            i2 = 1;
            // Search for the first open parenthesis
            while (i2 < length && bytes[i2] != '(')
                i2++;
            if (i2 < length)
                i2++; // Skip the open parenthesis itself
            int nesting = 1;
            while (i2 < length && nesting > 0) {
                if (bytes[i2] == '(')
                    nesting++;
                else if (bytes[i2] == ')')
                    nesting--;
                i2++;
            }
            i_next = i2 + 1;
        } else {
            // Check if the type is hex-encoded WKB
            i1 = 0;
            i2 = 0;
            while (i2 < length && IsHex[bytes[i2]])
                i2++;
            isHex = i2 > 1;
            i_next = i2;
        }
    }

    String geom_text = new String(bytes, i1, i2);

    try {
        if (isWKT) {
            geom = wktReader.read(geom_text);
        } else if (isHex) {
            byte[] binary = hexToBytes(geom_text);
            geom = wkbReader.read(binary);
        } else {
            geom = null;
        }
    } catch (ParseException e) {
        throw new RuntimeException(String.format("Error parsing '%s'", geom_text), e);
    }

    // Remove consumed bytes from the text
    if (i_next >= text.getLength())
        text.clear();
    else {
        if (bytes[i_next] == separator)
            i_next++;
        text.set(bytes, i_next, length - i_next);
    }

    return geom;
}

From source file:edu.umn.cs.spatialHadoop.mapred.SpatialRecordReader.java

License:Open Source License

/**
 * Reads the next line from input and returns true if a line was read.
 * If no more lines are available in this split, false is returned.
 * @param value The text object to fill with the next line
 * @return <code>true</code> if a line was read; <code>false</code> otherwise.
 * @throws IOException If an error occurs while reading from disk.
 */
protected boolean nextLine(Text value) throws IOException {
    if (blockType == BlockType.RTREE && pos == 8) {
        // File is positioned at the RTree header
        // Skip the header and go to first data object in file
        pos += RTree.skipHeader(in);
        LOG.info("Skipped R-tree to position: " + pos);
        // Reinitialize record reader at the new position
        lineReader = new LineReader(in);
    }
    while (getFilePosition() <= end) {
        value.clear();
        int b = 0;
        if (buffer != null) {
            // Read the first line encountered in buffer
            int eol = RTree.skipToEOL(buffer, 0);
            b += eol;
            value.append(buffer, 0, eol);
            if (eol < buffer.length) {
                // There are still some bytes remaining in buffer
                byte[] tmp = new byte[buffer.length - eol];
                System.arraycopy(buffer, eol, tmp, 0, tmp.length);
                buffer = tmp;
            } else {
                buffer = null;
            }
            // Check if a complete line has been read from the buffer
            byte last_byte = value.getBytes()[value.getLength() - 1];
            if (last_byte == '\n' || last_byte == '\r')
                return true;
        }

        // Read the first line from stream
        Text temp = new Text();
        b += lineReader.readLine(temp);
        if (b == 0) {
            // Indicates an end of stream
            return false;
        }
        pos += b;

        // Append the part read from stream to the part extracted from buffer
        value.append(temp.getBytes(), 0, temp.getLength());

        if (value.getLength() > 1) {
            // Read a non-empty line. Note that end-of-line character is included
            return true;
        }
    }
    // Reached end of file
    return false;
}

From source file:edu.umn.cs.spatialHadoop.mapreduce.SpatialRecordReader3.java

License:Open Source License

/**
 * Reads the next line from input and returns true if a line was read.
 * If no more lines are available in this split, false is returned.
 * @param value The text object to fill with the next line
 * @return <code>true</code> if a line was read; <code>false</code> otherwise.
 * @throws IOException If an error occurs while reading from disk.
 */
protected boolean nextLine(Text value) throws IOException {
    while (getPos() <= end) {
        value.clear();

        int lineLength;
        // Read the first line from stream
        if ((lineLength = lineReader.readLine(value)) <= 0) {
            // Indicates an end of stream
            return false;
        }

        // Append the part read from stream to the part extracted from buffer
        bytesRead += lineLength;

        if (value.getLength() > 1) {
            // Read a non-empty line. Note that end-of-line character is included
            return true;
        }
    }
    // Reached end of file
    return false;
}

From source file:edu.umn.cs.spatialHadoop.nasa.NASAPoint.java

License:Open Source License

@Override
public void fromText(Text text) {
    super.fromText(text);
    byte[] bytes = text.getBytes();
    text.set(bytes, 1, text.getLength() - 1);
    value = TextSerializerHelper.consumeInt(text, ',');
    timestamp = TextSerializerHelper.consumeLong(text, '\0');
}

From source file:edu.umn.cs.spatialHadoop.operations.FileMBR.java

License:Open Source License

public static Partition fileMBRLocal(Path[] inFiles, final OperationsParams params)
        throws IOException, InterruptedException {
    // 1- Split the input path/file to get splits that can be processed independently
    final SpatialInputFormat3<Rectangle, Shape> inputFormat = new SpatialInputFormat3<Rectangle, Shape>();
    Job job = Job.getInstance(params);
    SpatialInputFormat3.setInputPaths(job, inFiles);
    final List<org.apache.hadoop.mapreduce.InputSplit> splits = inputFormat.getSplits(job);
    int parallelism = params.getInt("parallel", Runtime.getRuntime().availableProcessors());

    // 2- Process splits in parallel
    List<Map<String, Partition>> allMbrs = Parallel.forEach(splits.size(),
            new RunnableRange<Map<String, Partition>>() {
                @Override
                public Map<String, Partition> run(int i1, int i2) {
                    Map<String, Partition> mbrs = new HashMap<String, Partition>();
                    for (int i = i1; i < i2; i++) {
                        try {
                            org.apache.hadoop.mapreduce.lib.input.FileSplit fsplit = (org.apache.hadoop.mapreduce.lib.input.FileSplit) splits
                                    .get(i);
                            final RecordReader<Rectangle, Iterable<Shape>> reader = inputFormat
                                    .createRecordReader(fsplit, null);
                            if (reader instanceof SpatialRecordReader3) {
                                ((SpatialRecordReader3) reader).initialize(fsplit, params);
                            } else if (reader instanceof RTreeRecordReader3) {
                                ((RTreeRecordReader3) reader).initialize(fsplit, params);
                            } else if (reader instanceof HDFRecordReader) {
                                ((HDFRecordReader) reader).initialize(fsplit, params);
                            } else {
                                throw new RuntimeException("Unknown record reader");
                            }
                            Partition p = mbrs.get(fsplit.getPath().getName());
                            if (p == null) {
                                p = new Partition();
                                p.filename = fsplit.getPath().getName();
                                p.cellId = p.filename.hashCode();
                                p.size = 0;
                                p.recordCount = 0;
                                p.set(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
                                mbrs.put(p.filename, p);
                            }
                            Text temp = new Text2();
                            while (reader.nextKeyValue()) {
                                Iterable<Shape> shapes = reader.getCurrentValue();
                                for (Shape s : shapes) {
                                    Rectangle mbr = s.getMBR();
                                    if (mbr != null)
                                        p.expand(mbr);
                                    p.recordCount++;
                                    temp.clear();
                                    s.toText(temp);
                                    p.size += temp.getLength() + 1;
                                }
                            }
                        } catch (IOException e) {
                            throw new RuntimeException(e);
                        } catch (InterruptedException e) {
                            throw new RuntimeException(e);
                        }
                    }
                    return mbrs;
                }
            }, parallelism);
    Map<String, Partition> mbrs = allMbrs.remove(allMbrs.size() - 1);
    for (Map<String, Partition> list : allMbrs) {
        for (Partition p1 : list.values()) {
            Partition p2 = mbrs.get(p1.filename);
            if (p2 != null) {
                p2.expand(p1);
            } else {
                mbrs.put(p1.filename, p1);
            }
        }
    }

    // Cache the final result, if needed
    for (Path inFile : inFiles) {
        FileSystem inFs = inFile.getFileSystem(params);
        if (!inFs.getFileStatus(inFile).isDir())
            continue;
        Path gindex_path = new Path(inFile, "_master.heap");
        // Answer has been already cached (may be by another job)
        if (inFs.exists(gindex_path))
            continue;
        FileStatus[] files = inFs.listStatus(inFile, SpatialSite.NonHiddenFileFilter);
        PrintStream wktout = new PrintStream(inFs.create(new Path(inFile, "_heap.wkt"), false));
        PrintStream gout = new PrintStream(inFs.create(gindex_path, false));

        Text text = new Text2();
        for (FileStatus file : files) {
            text.clear();
            Partition p = mbrs.get(file.getPath().getName());
            gout.println(p.toText(text).toString());
            wktout.println(p.toWKT());
        }

        wktout.close();
        gout.close();
    }

    // Return the final answer
    Partition finalResult = new Partition();
    finalResult.size = finalResult.recordCount = 0;
    finalResult.x1 = finalResult.y1 = Double.MAX_VALUE;
    finalResult.x2 = finalResult.y2 = -Double.MAX_VALUE;
    for (Partition p2 : mbrs.values())
        finalResult.expand(p2);
    return finalResult;
}

From source file:edu.umn.cs.spatialHadoop.operations.LocalSampler.java

License:Open Source License

/**
 * Read from the given stream until end-of-line is reached.
 * @param in - the input stream from where to read the line
 * @param line - the line that has been read from file not including EOL
 * @return - number of bytes read including EOL characters
 * @throws IOException
 */
public static int readUntilEOL(InputStream in, Text line) throws IOException {
    final byte[] bufferBytes = new byte[1024];
    int bufferLength = 0; // Length of the buffer
    do {
        if (bufferLength == bufferBytes.length) {
            // Buffer full. Copy to the output text
            line.append(bufferBytes, 0, bufferLength);
            bufferLength = 0;
        }
        if (bufferLength == 0) {
            // Read and skip any initial EOL characters
            do {
                bufferBytes[0] = (byte) in.read();
            } while (bufferBytes[0] != -1 && (bufferBytes[0] == '\n' || bufferBytes[0] == '\r'));
            if (bufferBytes[0] != -1)
                bufferLength++;
        } else {
            bufferBytes[bufferLength++] = (byte) in.read();
        }
    } while (bufferLength > 0 && bufferBytes[bufferLength - 1] != -1 && bufferBytes[bufferLength - 1] != '\n'
            && bufferBytes[bufferLength - 1] != '\r');
    if (bufferLength > 0) {
        bufferLength--;
        line.append(bufferBytes, 0, bufferLength);
    }
    return line.getLength();
}

From source file:edu.umn.cs.spatialHadoop.operations.Sampler.java

License:Open Source License

private static <T extends TextSerializable> int sampleLocalByCount(Path[] files, ResultCollector<T> output,
        OperationsParams params) throws IOException {

    ArrayList<Path> data_files = new ArrayList<Path>();
    for (Path file : files) {
        FileSystem fs = file.getFileSystem(params);
        if (fs.getFileStatus(file).isDir()) {
            // Directory, process all data files in this directory (visible files)
            FileStatus[] fileStatus = fs.listStatus(file, hiddenFileFilter);
            for (FileStatus f : fileStatus) {
                data_files.add(f.getPath());
            }
        } else {
            // File, process this file
            data_files.add(file);
        }
    }

    files = data_files.toArray(new Path[data_files.size()]);

    TextSerializable inObj1, outObj1;
    inObj1 = OperationsParams.getTextSerializable(params, "shape", new Text2());
    outObj1 = OperationsParams.getTextSerializable(params, "outshape", new Text2());

    // Make the objects final to be able to use in the anonymous inner class
    final TextSerializable inObj = inObj1;
    final T outObj = (T) outObj1;

    ResultCollector<TextSerializable> converter = createConverter(output, inObj, outObj);
    long[] files_start_offset = new long[files.length + 1]; // Prefix sum of files sizes
    long total_length = 0;
    for (int i_file = 0; i_file < files.length; i_file++) {
        FileSystem fs = files[i_file].getFileSystem(params);
        files_start_offset[i_file] = total_length;
        total_length += fs.getFileStatus(files[i_file]).getLen();
    }
    files_start_offset[files.length] = total_length;

    // Generate offsets to read from and make sure they are ordered to minimize
    // seeks between different HDFS blocks
    Random random = new Random(params.getLong("seed", System.currentTimeMillis()));
    long[] offsets = new long[params.getInt("count", 0)];
    for (int i = 0; i < offsets.length; i++) {
        if (total_length == 0)
            offsets[i] = 0;
        else
            offsets[i] = Math.abs(random.nextLong()) % total_length;
    }
    Arrays.sort(offsets);

    int record_i = 0; // Number of records read so far
    int records_returned = 0;

    int file_i = 0; // Index of the current file being sampled
    while (record_i < offsets.length) {
        // Skip to the file that contains the next sample
        while (offsets[record_i] > files_start_offset[file_i + 1])
            file_i++;

        long current_file_size = files_start_offset[file_i + 1] - files_start_offset[file_i];
        FileSystem fs = files[file_i].getFileSystem(params);
        ShapeLineRecordReader reader = new ShapeLineRecordReader(fs.getConf(),
                new FileSplit(files[file_i], 0, current_file_size, new String[] {}));
        Rectangle key = reader.createKey();
        Text line = reader.createValue();
        long pos = files_start_offset[file_i];

        while (record_i < offsets.length && offsets[record_i] <= files_start_offset[file_i + 1]
                && reader.next(key, line)) {
            pos += line.getLength();
            if (pos > offsets[record_i]) {
                // Passed the offset of record_i
                // Report this element to output
                if (converter != null) {
                    inObj.fromText(line);
                    converter.collect(inObj);
                }
                record_i++;
                records_returned++;
            }
        }
        reader.close();

        // Skip any remaining records that were supposed to be read from this file
        // This case might happen if a generated random position was in the middle
        // of the last line.
        while (record_i < offsets.length && offsets[record_i] <= files_start_offset[file_i + 1])
            record_i++;
    }
    return records_returned;
}

From source file:edu.umn.cs.spatialHadoop.osm.OSMPoint.java

License:Open Source License

@Override
public void fromText(Text text) {
    id = TextSerializerHelper.consumeLong(text, '\t');
    x = TextSerializerHelper.consumeDouble(text, '\t');
    y = TextSerializerHelper.consumeDouble(text, '\t');
    if (text.getLength() > 0)
        TextSerializerHelper.consumeMap(text, tags);
}

From source file:es.pic.astro.hadoop.io.BinaryOutputFormat.java

License:Apache License

/**
 * create the final out file, and output row by row. After one row is
 * appended, a configured row separator is appended
 *
 * @param jc
 *          the job configuration file
 * @param outPath
 *          the final output file to be created
 * @param valueClass
 *          the value class used for create
 * @param isCompressed
 *          whether the content is compressed or not
 * @param tableProperties
 *          the tableProperties of this file's corresponding table
 * @param progress
 *          progress used for status report
 * @return the RecordWriter
 */
@Override
public RecordWriter getHiveRecordWriter(JobConf jc, Path outPath, Class<? extends Writable> valueClass,
        boolean isCompressed, Properties tableProperties, Progressable progress) throws IOException {

    FileSystem fs = outPath.getFileSystem(jc);
    final OutputStream outStream = Utilities.createCompressedStream(jc, fs.create(outPath, progress),
            isCompressed);
    return new RecordWriter() {
        @Override
        public void write(Writable r) throws IOException {
            if (r instanceof Text) {
                Text tr = (Text) r;
                outStream.write(tr.getBytes(), 0, tr.getLength());
            } else {
                // DynamicSerDe always writes out BytesWritable
                BytesWritable bw = (BytesWritable) r;
                outStream.write(bw.get(), 0, bw.getSize());
            }
        }

        @Override
        public void close(boolean abort) throws IOException {
            outStream.close();
        }
    };
}