List of usage examples for org.apache.hadoop.io.Text Text()
public Text()
From source file: boa.datagen.SeqRepoImporter.java
License: Apache License

private static void getProcessedProjects() throws IOException {
    FileStatus[] files = fileSystem.listStatus(new Path(base + "tmprepcache/2015-08"));
    String hostname = InetAddress.getLocalHost().getHostName();
    for (int i = 0; i < files.length; i++) {
        FileStatus file = files[i];
        String prefix = "projects-" + hostname + "-";
        String name = file.getPath().getName();
        int index1 = name.indexOf(prefix);
        if (index1 > -1) {
            try {
                SequenceFile.Reader r = new SequenceFile.Reader(fileSystem, file.getPath(), conf);
                final Text key = new Text();
                while (r.next(key)) {
                    processedProjectIds.add(key.toString());
                }
                r.close();
            } catch (EOFException e) {
                printError(e, "EOF Exception in " + file.getPath().getName());
                fileSystem.delete(file.getPath(), false);
            }
        }
    }
    System.out.println("Got processed projects: " + processedProjectIds.size());
}
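The key detail above is that a single Text instance is reused across every r.next(key) call: Text is a mutable buffer, so its contents must be copied out (here via toString()) before the next read overwrites them. For context, a minimal sketch of a matching write side, with a hypothetical path and ids, assuming the Hadoop 2 SequenceFile.Writer option API:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class WriteProjectIds {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/projects-example"); // hypothetical path
        SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(path),
                SequenceFile.Writer.keyClass(Text.class),
                SequenceFile.Writer.valueClass(NullWritable.class));
        Text key = new Text(); // one mutable buffer, refilled per record
        for (String id : new String[] { "p1", "p2", "p3" }) { // hypothetical ids
            key.set(id);
            writer.append(key, NullWritable.get());
        }
        writer.close();
    }
}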
From source file: boostingPL.boosting.InstancesHelper.java
License: Open Source License

/**
 * Create an Instances header from metadata. The metadata looks like this:
 *
 * <br/>
 * <p>attributesNum:100</p>
 * <p>classes:+1,-1</p>
 * <br/>
 *
 * @param in the line reader positioned at the metadata
 * @return the Instances header
 * @throws IOException
 */
public static Instances createInstancesFromMetadata(LineReader in) throws IOException {
    int attributesNum = 0;
    ArrayList<Attribute> attInfo = null;
    List<String> classItems = null;

    Text line = new Text();
    while (in.readLine(line) > 0) {
        String sline = line.toString();
        if (sline.startsWith("attributesNum:")) {
            attributesNum = Integer.parseInt(sline.substring(14));
            attInfo = new ArrayList<Attribute>(attributesNum + 1);
            for (int i = 0; i < attributesNum; i++) {
                attInfo.add(new Attribute("attr" + i));
            }
            System.out.println("AttributeNum:" + attributesNum);
        } else if (sline.startsWith("classes:")) {
            String classes = sline.substring(8);
            String[] citems = classes.split(",");
            classItems = new ArrayList<String>(citems.length);
            for (String s : citems) {
                classItems.add(s);
            }
            System.out.println("classes:" + classes);
        }
    }

    attInfo.add(new Attribute("class", classItems));
    Instances insts = new Instances("BoostingPL-dataset", attInfo, 0);
    insts.setClassIndex(insts.numAttributes() - 1);
    return insts;
}
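A hypothetical caller, for illustration: Hadoop's LineReader refills the same Text buffer on every readLine() call, which is why the method converts each line to a String before parsing. The file path below is an assumption:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.LineReader;
import weka.core.Instances;

public class MetadataDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // hypothetical metadata file containing the two lines shown in the javadoc
        LineReader in = new LineReader(fs.open(new Path("/tmp/metadata.txt")), conf);
        try {
            Instances header = InstancesHelper.createInstancesFromMetadata(in);
            // attributesNum plain attributes plus the nominal class attribute
            System.out.println(header.numAttributes());
        } finally {
            in.close();
        }
    }
}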
From source file: brush.FastqRecordReader.java
License: Apache License

/**
 * Position the input stream at the start of the first record.
 *
 * @param stream The stream to reposition.
 */
protected void positionAtFirstRecord(FSDataInputStream stream) throws IOException {
    Text buffer = new Text();

    if (true) { // (start > 0)
        // use start > 0 to assume that files start with valid data

        // Advance to the start of the first record that ends with /1.
        // We use a temporary LineReader to read lines until we find the
        // position of the right one. We then seek the file to that position.
        stream.seek(start);
        LineReader reader = new LineReader(stream);

        int bytesRead = 0;
        do {
            bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
            int bufferLength = buffer.getLength();
            if (bytesRead > 0 && !checkBuffer(bufferLength, buffer)) {
                start += bytesRead;
            } else {
                // Line starts with @. Read two more lines and verify that
                // the second one starts with a +.
                //
                // If this isn't the start of a record, we want to backtrack to its end.
                long backtrackPosition = start + bytesRead;

                bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') {
                    break; // all good!
                } else {
                    // Backtrack to the end of the record we thought was the start.
                    start = backtrackPosition;
                    stream.seek(start);
                    reader = new LineReader(stream);
                }
            }
        } while (bytesRead > 0);

        stream.seek(start);
    }

    pos = start;
}
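The backtracking relies on the FASTQ layout: each record is four lines (an '@' header, the bases, a '+' separator, and the quality string). Because '@' is also a legal quality-score character, a line starting with '@' is only a record start if the line two past it starts with '+'. A minimal standalone restatement of that test (method sketch, not part of the original class):

// l0 is the candidate header line, l2 the line two past it.
// '@' alone is ambiguous, since quality strings may also begin with '@'.
static boolean looksLikeRecordStart(String l0, String l2) {
    return !l0.isEmpty() && l0.charAt(0) == '@'
            && !l2.isEmpty() && l2.charAt(0) == '+';
}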
From source file: brush.FastqRecordReader.java
License: Apache License

/**
 * Seeks ahead in our split to the next key-value pair.
 *
 * Triggers the read of an interleaved FASTQ read pair, and populates
 * internal state.
 *
 * @return True if reading the next read pair succeeded.
 */
public boolean nextKeyValue() throws IOException, InterruptedException {
    currentValue = new Text();
    return next(currentValue);
}
From source file: brush.FastqRecordReader.java
License: Apache License

/**
 * Reads a line from the underlying line reader and appends it, plus a
 * trailing newline, to a text record.
 *
 * @param dest Text record to append the line to.
 * @param eofOk Whether an EOF is acceptable on this line.
 * @return Returns the number of bytes read.
 *
 * @throws EOFException Thrown if eofOk was false and we hit an EOF on
 * the current line.
 */
private int appendLineInto(final Text dest, final boolean eofOk) throws EOFException, IOException {
    Text buf = new Text();
    int bytesRead = lineReader.readLine(buf, MAX_LINE_LENGTH);

    if (bytesRead < 0 || (bytesRead == 0 && !eofOk))
        throw new EOFException();

    dest.append(buf.getBytes(), 0, buf.getLength());
    dest.append(newline, 0, 1);
    pos += bytesRead;

    return bytesRead;
}
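One detail worth calling out: Text.getBytes() returns the backing array, which can be longer than the valid contents, so the copy must be bounded by getLength(), exactly as above. A self-contained sketch of the same append pattern assembling a four-line record (hypothetical data):

import org.apache.hadoop.io.Text;

public class AppendDemo {
    public static void main(String[] args) {
        byte[] newline = { '\n' };
        Text dest = new Text();
        Text buf = new Text();
        for (String line : new String[] { "@read1/1", "GATTACA", "+", "IIIIIII" }) {
            buf.set(line);
            // bound the copy by getLength(), not getBytes().length
            dest.append(buf.getBytes(), 0, buf.getLength());
            dest.append(newline, 0, 1);
        }
        System.out.print(dest.toString()); // four newline-terminated lines
    }
}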
From source file: bucket_sort.NLineInputFormat.java
License: Apache License

public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numLinesPerSplit)
        throws IOException {
    List<FileSplit> splits = new ArrayList<FileSplit>();
    Path fileName = status.getPath();
    if (status.isDir()) {
        throw new IOException("Not a file: " + fileName);
    }
    FileSystem fs = fileName.getFileSystem(conf);
    LineReader lr = null;
    try {
        FSDataInputStream in = fs.open(fileName);
        lr = new LineReader(in, conf);
        Text line = new Text();
        int numLines = 0;
        long begin = 0;
        long length = 0;
        int num = -1;
        while ((num = lr.readLine(line)) > 0) {
            numLines++;
            length += num;
            if (numLines == numLinesPerSplit) {
                // NLineInputFormat uses LineRecordReader, which always reads
                // (and consumes) at least one character out of its upper split
                // boundary. So to make sure that each mapper gets N lines, we
                // move back the upper split limits of each split by one
                // character here.
                if (begin == 0) {
                    splits.add(new FileSplit(fileName, begin, length - 1, new String[] {}));
                } else {
                    splits.add(new FileSplit(fileName, begin - 1, length, new String[] {}));
                }
                begin += length;
                length = 0;
                numLines = 0;
            }
        }
        if (numLines != 0) {
            splits.add(new FileSplit(fileName, begin, length, new String[] {}));
        }
    } finally {
        if (lr != null) {
            lr.close();
        }
    }
    return splits;
}
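A small driver, with a hypothetical temp file, that makes the one-byte back-off concrete: for the content "a\nb\nc\nd\n" and numLinesPerSplit = 2, the splits come out as (start=0, length=3) and (start=3, length=4). Every split after the first starts one byte early, LineRecordReader discards the partial first line, and each mapper still sees exactly two lines. This assumes the method above returns the mapreduce-API FileSplit:

import java.io.File;
import java.io.FileWriter;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class SplitDemo {
    public static void main(String[] args) throws Exception {
        File f = File.createTempFile("nline", ".txt"); // hypothetical input
        FileWriter w = new FileWriter(f);
        w.write("a\nb\nc\nd\n");
        w.close();
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);
        FileStatus status = fs.getFileStatus(new Path(f.getAbsolutePath()));
        // NLineInputFormat here is the bucket_sort class shown above
        List<FileSplit> splits = NLineInputFormat.getSplitsForFile(status, conf, 2);
        for (FileSplit s : splits) {
            System.out.println("start=" + s.getStart() + " length=" + s.getLength());
        }
        // prints: start=0 length=3, then start=3 length=4
    }
}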
From source file: ca.uwaterloo.iss4e.hadoop.io.CartesianRecordReader.java
License: Open Source License

public CartesianRecordReader(CompositeInputSplit split, TaskAttemptContext taskAttemptContext)
        throws IOException {
    this.leftIS = split.get(0);
    this.rightIS = split.get(1);
    this.rightTaskAttemptContext = taskAttemptContext;
    this.key = new Text();
    this.value = new Text();

    Configuration conf = rightTaskAttemptContext.getConfiguration();
    try {
        // Create left record reader
        FileInputFormat leftFIF = (FileInputFormat) ReflectionUtils
                .newInstance(Class.forName(conf.get(CartesianInputFormat.LEFT_INPUT_FORMAT)), conf);
        leftRR = leftFIF.createRecordReader(leftIS, taskAttemptContext);

        // Create right record reader
        rightFIF = (FileInputFormat) ReflectionUtils
                .newInstance(Class.forName(conf.get(CartesianInputFormat.RIGHT_INPUT_FORMAT)), conf);
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
        throw new IOException(e);
    } catch (InterruptedException e) {
        e.printStackTrace();
        throw new IOException(e);
    }
}
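For illustration, a hypothetical job setup that would satisfy this constructor: the left and right input format class names travel through the Configuration and are instantiated reflectively. The key constants belong to CartesianInputFormat; how that class is normally wired up is an assumption here:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class CartesianSetup {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "cartesian-demo");
        Configuration conf = job.getConfiguration();
        // class names stored in the Configuration; the reader above recovers
        // them with Class.forName + ReflectionUtils.newInstance
        conf.set(CartesianInputFormat.LEFT_INPUT_FORMAT, TextInputFormat.class.getName());
        conf.set(CartesianInputFormat.RIGHT_INPUT_FORMAT, TextInputFormat.class.getName());
        job.setInputFormatClass(CartesianInputFormat.class);
    }
}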
From source file: cascading.scheme.hadoop.TextDelimited.java
License: Open Source License

@Override
public void sinkPrepare(FlowProcess<? extends Configuration> flowProcess,
        SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
    sinkCall.setContext(new Object[3]);

    sinkCall.getContext()[0] = new Text();
    sinkCall.getContext()[1] = new StringBuilder(4 * 1024);
    sinkCall.getContext()[2] = Charset.forName(charsetName);

    if (writeHeader)
        writeHeader(sinkCall);
}
From source file: cascading.scheme.hadoop.TextLine.java
License: Open Source License

@Override
public void sinkPrepare(FlowProcess<? extends Configuration> flowProcess,
        SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
    sinkCall.setContext(new Object[2]);

    sinkCall.getContext()[0] = new Text();
    sinkCall.getContext()[1] = Charset.forName(charsetName);
}
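In both schemes above, the pattern is the same: allocate the reusable buffers once per sink in sinkPrepare() and stash them in the call context, so nothing is allocated per tuple. A hedged sketch, not Cascading's actual implementation, of how a sink() method might consume the TextLine context prepared above:

// Method sketch only; assumes the enclosing Scheme subclass.
@Override
public void sink(FlowProcess<? extends Configuration> flowProcess,
        SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
    Text text = (Text) sinkCall.getContext()[0];
    Charset charset = (Charset) sinkCall.getContext()[1];
    // refill the same Text instance for every tuple
    String line = sinkCall.getOutgoingEntry().getTuple().toString();
    text.set(line.getBytes(charset));
    sinkCall.getOutput().collect(null, text);
}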
From source file: cascading.tap.hadoop.ZipInputFormatTest.java
License: Open Source License

public void testSplits() throws Exception {
    JobConf job = new JobConf();
    FileSystem currentFs = FileSystem.get(job);

    Path file = new Path(workDir, "test.zip");

    Reporter reporter = Reporter.NULL;

    int seed = new Random().nextInt();
    LOG.info("seed = " + seed);
    Random random = new Random(seed);

    FileInputFormat.setInputPaths(job, file);

    for (int entries = 1; entries < MAX_ENTRIES; entries += random.nextInt(MAX_ENTRIES / 10) + 1) {
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        ZipOutputStream zos = new ZipOutputStream(byteArrayOutputStream);
        long length = 0;

        LOG.debug("creating; zip file with entries = " + entries);

        // for each entry in the zip file
        for (int entryCounter = 0; entryCounter < entries; entryCounter++) {
            // construct zip entries splitting MAX_LENGTH between entries
            long entryLength = MAX_LENGTH / entries;
            ZipEntry zipEntry = new ZipEntry("/entry" + entryCounter + ".txt");
            zipEntry.setMethod(ZipEntry.DEFLATED);
            zos.putNextEntry(zipEntry);

            for (length = entryCounter * entryLength; length < (entryCounter + 1) * entryLength; length++) {
                zos.write(Long.toString(length).getBytes());
                zos.write("\n".getBytes());
            }

            zos.flush();
            zos.closeEntry();
        }

        zos.flush();
        zos.close();

        currentFs.delete(file, true);

        OutputStream outputStream = currentFs.create(file);
        byteArrayOutputStream.writeTo(outputStream);
        outputStream.close();

        ZipInputFormat format = new ZipInputFormat();
        format.configure(job);

        LongWritable key = new LongWritable();
        Text value = new Text();

        InputSplit[] splits = format.getSplits(job, 100);

        BitSet bits = new BitSet((int) length);
        for (int j = 0; j < splits.length; j++) {
            LOG.debug("split[" + j + "]= " + splits[j]);
            RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[j], job, reporter);

            try {
                int count = 0;

                while (reader.next(key, value)) {
                    int v = Integer.parseInt(value.toString());

                    LOG.debug("read " + v);

                    if (bits.get(v))
                        LOG.warn("conflict with " + v + " in split " + j + " at position " + reader.getPos());

                    assertFalse("key in multiple partitions.", bits.get(v));
                    bits.set(v);
                    count++;
                }

                LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + count);
            } finally {
                reader.close();
            }
        }

        assertEquals("some keys in no partition.", length, bits.cardinality());
    }
}