Example usage for org.apache.hadoop.io Text toString

List of usage examples for org.apache.hadoop.io Text toString

Introduction

On this page you can find example usages of the toString method of org.apache.hadoop.io.Text.

Prototype

@Override
public String toString() 

Source Link

Document

Convert text back to string

Usage

From source file:com.ibm.bi.dml.runtime.io.ReaderTextCSVParallel.java

License:Open Source License

/**
 * Determines the dimensions of a CSV input and allocates the target matrix block.
 * <p>
 * The column count is derived from the first record of the first split by counting the
 * occurrences of the delimiter. Rows are counted with one {@code CountRowsTask} per split;
 * the resulting per-split row offsets and lengths are stored in {@code _offsets}.
 *
 * @param splits input splits of the CSV input; the first split is used to count the columns
 * @param path path of the CSV input, added as input path to the job
 * @param job job configuration used for the input format and the record readers
 * @param hasHeader header flag, passed as given to the task of the first split only
 * @param delim column delimiter
 * @param estnnz estimated number of non-zeros, forwarded to the block allocation
 * @return the allocated output matrix block
 * @throws IOException if reading the first record fails, a count task reports a failure,
 *         or the parallel row count fails for any other reason
 * @throws DMLRuntimeException presumably raised by the block allocation (not visible here)
 */
private MatrixBlock computeCSVSizeAndCreateOutputMatrixBlock(InputSplit[] splits, Path path, JobConf job,
        boolean hasHeader, String delim, long estnnz) throws IOException, DMLRuntimeException {
    int nrow = 0;
    int ncol = 0;

    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);

    // count number of entries in the first record of the first split
    // NOTE(review): a header line is not skipped here; this assumes the header has
    // as many columns as the data rows
    LongWritable key = new LongWritable();
    Text oneLine = new Text();
    RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[0], job, Reporter.NULL);
    try {
        if (reader.next(key, oneLine)) {
            String cellStr = oneLine.toString().trim();
            ncol = StringUtils.countMatches(cellStr, delim) + 1;
        }
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }

    // count rows in parallel per split
    ExecutorService pool = null;
    try {
        pool = Executors.newFixedThreadPool(_numThreads);
        ArrayList<CountRowsTask> tasks = new ArrayList<CountRowsTask>();
        for (InputSplit split : splits) {
            tasks.add(new CountRowsTask(split, informat, job, hasHeader));
            // only the first split can contain the header
            hasHeader = false;
        }
        pool.invokeAll(tasks);

        // collect row counts for offset computation
        // early error notify in case not all tasks successful
        _offsets = new SplitOffsetInfos(tasks.size());
        for (int i = 0; i < tasks.size(); i++) {
            // use the loop index directly instead of tasks.indexOf(rt), which is a
            // linear search per task and depends on equals()
            CountRowsTask rt = tasks.get(i);
            if (!rt.getReturnCode())
                throw new IOException("Count task for csv input failed: " + rt.getErrMsg());
            _offsets.setOffsetPerSplit(i, nrow);
            _offsets.setLenghtPerSplit(i, rt.getRowCount());
            nrow = nrow + rt.getRowCount();
        }
    } catch (InterruptedException e) {
        // restore the interrupt flag so callers can still observe the interruption
        Thread.currentThread().interrupt();
        throw new IOException("Threadpool Error " + e.getMessage(), e);
    } catch (Exception e) {
        throw new IOException("Threadpool Error " + e.getMessage(), e);
    } finally {
        // always release the worker threads, also on failure
        if (pool != null)
            pool.shutdown();
    }

    // allocate target matrix block based on given size; 
    // need to allocate sparse as well since lock-free insert into target
    return createOutputMatrixBlock(nrow, ncol, estnnz, true, true);
}

From source file:com.ibm.bi.dml.runtime.matrix.data.TextCellToRowBlockConverter.java

License:Open Source License

/**
 * Converts one text cell line into a 1x1 row block with its indexes.
 * <p>
 * Lines starting with "%" are skipped (matrix market support). After a line starting
 * with "%%" the next line that does not start with "%" is skipped as well. For skipped
 * lines {@code hasValue} is set to false, otherwise to true.
 *
 * @param k1 input key, not used by this method
 * @param v1 text line to convert
 */
@Override
public void convert(LongWritable k1, Text v1) {
    final String line = v1.toString();

    // handle support for matrix market format
    final boolean commentLine = line.startsWith("%");
    if (commentLine && line.startsWith("%%")) {
        toIgnore = true;
    }
    if (commentLine || toIgnore) {
        if (!commentLine) {
            // the pending skip is consumed by this non-comment line
            toIgnore = false;
        }
        hasValue = false;
        return;
    }

    // reset the tokenizer
    st.reset(line);

    // convert text to row block: row index, column index, value
    final long rowIndex = st.nextLong();
    final long colIndex = st.nextLong();
    indexes.setIndexes(rowIndex, colIndex);
    rowBlock.reset(1, 1);
    rowBlock.quickSetValue(0, 0, st.nextDouble());
    hasValue = true;
}

From source file:com.ibm.bi.dml.runtime.matrix.data.TextToBinaryCellConverter.java

License:Open Source License

/**
 * Converts one text cell line into cell indexes and a cell value.
 * <p>
 * Lines starting with "%" are skipped (matrix market support). After a line starting
 * with "%%" the next line that does not start with "%" is skipped as well. For skipped
 * lines {@code hasValue} is set to false, otherwise to true.
 *
 * @param k1 input key, not used by this method
 * @param v1 text line to convert
 */
@Override
public void convert(LongWritable k1, Text v1) {
    final String text = v1.toString();

    // handle support for matrix market format
    if (text.startsWith("%")) {
        if (text.startsWith("%%")) {
            toIgnore = true;
        }
        hasValue = false;
        return;
    }
    if (toIgnore) {
        // the pending skip is consumed by this non-comment line
        toIgnore = false;
        hasValue = false;
        return;
    }

    // reset the tokenizer
    st.reset(text);

    // convert text to matrix cell: row index, column index, value
    final long row = st.nextLong();
    final long col = st.nextLong();
    indexes.setIndexes(row, col);
    value.setValue(st.nextDouble());
    hasValue = true;
}

From source file:com.ibm.bi.dml.runtime.matrix.mapred.CSVAssignRowIDMapper.java

License:Open Source License

/**
 * Counts the non-omitted lines of a split and reports the number of columns.
 * <p>
 * On the first call the file offset (the key) and the output collector are remembered.
 * The column count is reported via a counter, taken from the record at key 0 of the
 * header file, or from the following record when the first line is ignored.
 *
 * @param key key of the record, compared against 0 to detect the first line
 * @param value text line
 * @param out output collector, cached on the first call
 * @param report reporter used to publish the column count
 * @throws IOException declared by the mapper contract
 */
@Override
public void map(LongWritable key, Text value, OutputCollector<ByteWritable, OffsetCount> out, Reporter report)
        throws IOException {
    if (first) {
        first = false;
        fileOffset = key.get();
        outCache = out;
    }

    final boolean firstLineOfHeaderFile = key.get() == 0 && headerFile;

    if (firstLineOfHeaderFile && ignoreFirstLine) {
        // skip the ignored line; the next record provides the number of columns
        realFirstLine = true;
        return;
    }

    if (firstLineOfHeaderFile || realFirstLine) {
        // getting the number of columns
        report.incrCounter(CSVReblockMR.NUM_COLS_IN_MATRIX, outKey.toString(),
                value.toString().split(delim, -1).length);
        if (!firstLineOfHeaderFile) {
            realFirstLine = false;
        }
    }

    if (!omit(value.toString())) {
        num++;
    }
}

From source file:com.ibm.bi.dml.runtime.matrix.mapred.CSVReblockMapper.java

License:Open Source License

/**
 * Splits one CSV line into cells and processes the row for every reblock instruction.
 * <p>
 * On the first call the row offset is looked up in {@code offsetMap} by the record key.
 * The record at key 0 of the header file is skipped if the first line is to be ignored.
 *
 * @param key key of the record
 * @param value CSV line
 * @param out output collector passed on to {@code processRow}
 * @param reporter not used by this method
 * @throws IOException declared by the mapper contract
 */
@Override
public void map(LongWritable key, Text value, OutputCollector<TaggedFirstSecondIndexes, BlockRow> out,
        Reporter reporter) throws IOException {
    if (first) {
        rowOffset = offsetMap.get(key.get());
        first = false;
    }

    final boolean skipHeaderLine = key.get() == 0 && headerFile && ignoreFirstLine;
    if (skipHeaderLine) {
        return;
    }

    final String[] cells = IOUtilFunctions.split(value.toString(), _delim);

    int matrixIndex = 0;
    while (matrixIndex < representativeMatrixes.size()) {
        for (CSVReblockInstruction ins : csv_reblock_instructions.get(matrixIndex)) {
            idxRow = processRow(idxRow, cells, rowOffset, num, ins.output, ins.brlen, ins.bclen, ins.fill,
                    ins.fillValue, out);
        }
        matrixIndex++;
    }

    num++;
}

From source file:com.ibm.bi.dml.runtime.transform.ApplyTfBBMapper.java

License:Open Source License

/**
 * Applies the transformation to one input line and reblocks the transformed row.
 * <p>
 * On the first call the row offset is looked up in {@code offsetMap} by the record key
 * and the reporter is remembered. Lines for which {@code tfmapper.omit} returns true are
 * skipped and do not increase the row counter {@code num}.
 *
 * @param rawKey key of the record, compared against 0 to detect the header line
 * @param rawValue input line
 * @param out output collector passed on to {@code CSVReblockMapper.processRow}
 * @param reporter reporter, remembered on the first call
 * @throws IOException declared by the mapper contract
 * @throws RuntimeException if the transformed row fails the check or the reblock; the
 *         original {@code DMLRuntimeException} is attached as cause
 */
@Override
public void map(LongWritable rawKey, Text rawValue,
        OutputCollector<TaggedFirstSecondIndexes, CSVReblockMR.BlockRow> out, Reporter reporter)
        throws IOException {

    if (_first) {
        rowOffset = offsetMap.get(rawKey.get());
        _reporter = reporter;
        _first = false;
    }

    // output the header line
    if (rawKey.get() == 0 && _partFileWithHeader) {
        tfmapper.processHeaderLine();
        if (tfmapper.hasHeader())
            return;
    }

    // parse the input line and apply transformation
    String[] words = tfmapper.getWords(rawValue);

    if (!tfmapper.omit(words)) {
        words = tfmapper.apply(words);
        try {
            tfmapper.check(words);

            // Perform CSV Reblock
            CSVReblockInstruction ins = csv_reblock_instructions.get(0).get(0);
            idxRow = CSVReblockMapper.processRow(idxRow, words, rowOffset, num, ins.output, ins.brlen,
                    ins.bclen, ins.fill, ins.fillValue, out);
        } catch (DMLRuntimeException e) {
            // keep the original exception as cause so its stack trace is not lost
            throw new RuntimeException(e.getMessage() + ":" + rawValue.toString(), e);
        }
        num++;
    }
}

From source file:com.ibm.bi.dml.runtime.transform.ApplyTfCSVMapper.java

License:Open Source License

/**
 * Applies the transformation to one input line and writes the result to a custom part file.
 * <p>
 * On the first record of a split, the writer {@code br} is created for a part file whose
 * name is derived from the record key. Lines for which {@code tfmapper.omit} returns true
 * are skipped. Nothing is emitted through the output collector.
 *
 * @param rawKey key of the record, compared against 0 to detect the header line
 * @param rawValue input line
 * @param out output collector, not used by this method
 * @param reporter reporter, remembered for later use
 * @throws IOException if the part file cannot be created or written
 * @throws RuntimeException if applying or checking the transformation fails; the
 *         original {@code DMLRuntimeException} is attached as cause
 */
@Override
public void map(LongWritable rawKey, Text rawValue, OutputCollector<NullWritable, Text> out, Reporter reporter)
        throws IOException {

    if (_firstRecordInSplit) {
        _firstRecordInSplit = false;
        _reporter = reporter;

        // generate custom output paths so that order of rows in the 
        // output (across part files) matches w/ that from input data set
        String partFileSuffix = tfmapper.getPartFileID(_rJob, rawKey.get());
        Path mapOutputPath = new Path(tfmapper.getOutputPath() + "/transform-part-" + partFileSuffix);

        // setup the writer for mapper's output
        // the default part-..... files will be deleted later once the job finishes 
        // NOTE(review): OutputStreamWriter without a charset uses the platform default
        // encoding; the writer is not closed here - presumably in close(), confirm
        br = new BufferedWriter(new OutputStreamWriter(FileSystem.get(_rJob).create(mapOutputPath, true)));
    }

    // output the header line
    if (rawKey.get() == 0 && _partFileWithHeader) {
        _reporter = reporter;
        tfmapper.processHeaderLine();
        if (tfmapper.hasHeader())
            return;
    }

    // parse the input line and apply transformation
    String[] words = tfmapper.getWords(rawValue);

    if (!tfmapper.omit(words)) {
        try {
            words = tfmapper.apply(words);
            String outStr = tfmapper.checkAndPrepOutputString(words);
            //out.collect(NullWritable.get(), new Text(outStr));
            br.write(outStr + "\n");
        } catch (DMLRuntimeException e) {
            // keep the original exception as cause so its stack trace is not lost
            throw new RuntimeException(e.getMessage() + ": " + rawValue.toString(), e);
        }
    }
}

From source file:com.ibm.bi.dml.runtime.transform.ApplyTfHelper.java

License:Open Source License

/**
 * Splits the given line into its fields using {@code _delim}.
 * The limit of -1 keeps trailing empty fields.
 *
 * @param line text line to split
 * @return the fields of the line
 */
public String[] getWords(Text line) {
    final String text = line.toString();
    return _delim.split(text, -1);
}

From source file:com.ibm.bi.dml.runtime.transform.GTFMTDMapper.java

License:Open Source License

/**
 * Passes one input line to {@code _agents.prepareTfMtd}, skipping the header line.
 * <p>
 * On the first record of a split the output collector and the record key (offset in the
 * part file) are remembered.
 *
 * @param rawKey key of the record, compared against 0 to detect the header line
 * @param rawValue input line
 * @param out output collector, remembered on the first record
 * @param reporter not used by this method
 * @throws IOException declared by the mapper contract
 */
public void map(LongWritable rawKey, Text rawValue, OutputCollector<IntWritable, DistinctValue> out,
        Reporter reporter) throws IOException {

    if (_firstRecordInSplit) {
        _offsetInPartFile = rawKey.get();
        _collector = out;
        _firstRecordInSplit = false;
    }

    // ignore header
    final boolean headerLine = _agents.hasHeader() && rawKey.get() == 0 && _partFileWithHeader;
    if (!headerLine) {
        _agents.prepareTfMtd(rawValue.toString());
    }
}

From source file:com.ibm.bi.dml.runtime.transform.TfUtils.java

License:Open Source License

/**
 * Converts the given line to a string and delegates to {@code getWords(String)}.
 *
 * @param line text line to split
 * @return the result of {@code getWords(String)} for the line
 */
public String[] getWords(Text line) {
    final String text = line.toString();
    return getWords(text);
}