List of usage examples for org.apache.hadoop.io.Text.toString()
@Override
public String toString()
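Text stores its contents as UTF-8 encoded bytes, and toString() decodes those bytes back into a java.lang.String. Before the examples below, a minimal round-trip sketch (the class name TextToStringDemo is made up for illustration):

import org.apache.hadoop.io.Text;

public class TextToStringDemo {
    public static void main(String[] args) {
        Text t = new Text("hello hadoop"); // stored internally as UTF-8 bytes
        String s = t.toString();           // decoded back into a java.lang.String
        System.out.println(s + " (" + t.getLength() + " bytes)");
    }
}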
From source file:com.hyperiongray.ccmr.s3wordcount.WordCountOnlyMapper.java
License:Apache License
public void map(Object key, Text value, OutputCollector<NullWritable, Text> outputCollector, Reporter reporter)
        throws IOException {

    // We're accessing a publicly available bucket, so we don't need to fill in our credentials
    ArchiveReader ar;
    try {
        S3Service s3s = new RestS3Service(null);

        // Grab a file out of the CommonCrawl S3 bucket
        String fn = value.toString();
        logger.info(fn);
        S3Object f = s3s.getObject("aws-publicdatasets", fn, null, null, null, null, null, null);

        // The file name identifies the ArchiveReader and indicates whether it should be decompressed
        ar = WARCReaderFactory.get(fn, f.getDataInputStream(), true);
    } catch (ServiceException e) {
        logger.error("S3 connection failed", e);
        throw new RuntimeException(e);
    }

    // Once we have an ArchiveReader, we can work through each of the records it contains
    int i = 0;
    logger.info("Started " + new Date());
    for (ArchiveRecord r : ar) {
        reporter.progress();
        String url = "";
        try {
            // The header contains information such as the type of record, size, creation time, and URL
            url = r.getHeader().getUrl();
            String crawledDate = r.getHeader().getDate();
            if (url == null)
                continue;

            // To read the contents of the record, use the ArchiveRecord as an InputStream
            // and dump it into an in-memory buffer
            OutputStream os = new ByteArrayOutputStream();
            try {
                r.dump(os);
            } finally {
                try {
                    r.close();
                } catch (Exception e) {
                    logger.error("reading inputstream failed", e);
                }
            }

            // Note: a potential optimization would be a large buffer allocated only once;
            // ByteArrayOutputStream.toString() decodes with the platform default charset
            String content = os.toString();

            Map<String, Integer> matches = contentMatcher.matchContent(content);
            int score = contentMatcher.score(matches);
            if (score > LOWER_SCORE_THRESHOLD) {
                logger.info("****************************************");
                logger.info("URL: " + url + " Score: " + score + " Detail: " + matches);
                outputCollector.collect(NullWritable.get(), new Text(outputParser
                        .parse(contentMatcher.getTitle(content), url, crawledDate, score, matches)));
            }

            logger.debug(String.valueOf(i));
            if (i++ > sampleSize) {
                logger.info("Finished " + new Date());
                break;
            }
        } catch (Exception e) {
            logger.error("url failed " + url, e);
        }
    }
}
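The mapper treats each incoming Text value as the S3 key of one WARC file, so the job is typically driven by a plain text file listing one key per line. A hedged wiring sketch using the classic mapred API (the driver class, input listing, and output path are assumptions, not part of the original source):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class S3WordCountDriver { // hypothetical driver class
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(S3WordCountDriver.class);
        conf.setJobName("s3wordcount");
        conf.setMapperClass(WordCountOnlyMapper.class);
        conf.setNumReduceTasks(0); // map-only sketch; adjust if a reducer is wired in
        conf.setOutputKeyClass(NullWritable.class);
        conf.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(conf, new Path("warc.paths")); // assumed listing of WARC keys
        FileOutputFormat.setOutputPath(conf, new Path("out"));      // assumed output directory
        JobClient.runJob(conf);
    }
}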
From source file:com.hyperiongray.ccmr.s3wordcount.WordCountReducer.java
License:Apache License
public void reduce(Text url, Iterator<IntWritable> scores,
        OutputCollector<Text, Map<Text, IntWritable>> results, Reporter reporter) throws IOException {
    String domainName;
    try {
        domainName = this.getDomainName(url.toString());

        Map<Text, IntWritable> map = Maps.newHashMap();
        int acum = 0;
        while (scores.hasNext()) {
            acum += scores.next().get();
        }
        map.put(new Text(url), new IntWritable(acum));
        results.collect(new Text(domainName), map);
    } catch (URISyntaxException e) {
        e.printStackTrace();
    }
}
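The getDomainName helper is not shown in this listing; since the reducer catches URISyntaxException, a plausible sketch based on java.net.URI (the actual implementation may differ) is:

private String getDomainName(String url) throws URISyntaxException {
    // Hypothetical implementation: reduce a full URL to its bare host name
    String host = new java.net.URI(url).getHost();
    if (host == null)
        throw new URISyntaxException(url, "no host component");
    return host.startsWith("www.") ? host.substring(4) : host;
}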
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.DataPartitionerLocal.java
License:Open Source License
/**
 * Partitions a matrix stored in text cell format: reads it from HDFS, buffers cells into a
 * local staging area, and writes the resulting partitions back to HDFS.
 *
 * @param fname
 * @param fnameStaging
 * @param fnameNew
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws DMLRuntimeException
 */
private void partitionTextCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen,
        int brlen, int bclen) throws DMLRuntimeException {
    long row = -1;
    long col = -1;

    try {
        // STEP 1: read matrix from HDFS and write blocks to local staging area
        // check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);

        LinkedList<Cell> buffer = new LinkedList<Cell>();
        LongWritable key = new LongWritable();
        Text value = new Text();
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value)) {
                    st.reset(value.toString()); // reset tokenizer
                    row = st.nextLong();
                    col = st.nextLong();
                    double lvalue = st.nextDouble();

                    Cell tmp = new Cell(row, col, lvalue);
                    buffer.addLast(tmp);
                    if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) // periodic flush
                    {
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                }

                // final flush
                if (!buffer.isEmpty()) {
                    appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                    buffer.clear();
                }
            } finally {
                if (reader != null)
                    reader.close();
            }
        }

        // STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);
                threads[i] = new Thread(new DataPartitionerWorkerTextCell(job, fnameNew, fnameStaging,
                        fnamesPartitions, start, end));
                threads[i].start();
            }
            for (Thread t : threads)
                t.join();
        } else {
            for (String pdir : fnamesPartitions)
                writeTextCellFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
        }
    } catch (Exception e) {
        // post-mortem error handling and bounds checking
        if (row < 1 || row > rlen || col < 1 || col > clen) {
            throw new DMLRuntimeException("Matrix cell [" + row + "," + col + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else
            throw new DMLRuntimeException("Unable to partition text cell matrix.", e);
    }
}
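Each line of the text cell format is a space-separated triple "row col value" with 1-based indices. FastStringTokenizer is SystemML's allocation-conscious tokenizer; the parse it performs is equivalent to this plain-Java sketch (illustrative only):

// Parse one text cell line such as "7 12 3.14"
String line = value.toString();      // value is the Text record delivered by the reader
String[] parts = line.split(" ");
long row = Long.parseLong(parts[0]); // 1-based row index
long col = Long.parseLong(parts[1]); // 1-based column index
double cellValue = Double.parseDouble(parts[2]);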
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteDPParForMR.java
License:Open Source License
/**
 * The result file contains a hierarchy of workerID-resultvar (incl. filename). We deduplicate
 * on the workerID. Without JVM reuse each task refers to a unique workerID, so we will not
 * find any duplicates. With JVM reuse, however, each slot refers to a workerID, and there are
 * duplicate filenames due to partial aggregation and overwrite of fname (the
 * RemoteParWorkerMapper ensures uniqueness of those files independent of the runtime
 * implementation).
 *
 * @param job
 * @param fname
 * @return array of local variable maps, one per worker
 * @throws DMLRuntimeException
 * @throws IOException
 */
@SuppressWarnings("deprecation")
public static LocalVariableMap[] readResultFile(JobConf job, String fname)
        throws DMLRuntimeException, IOException {
    HashMap<Long, LocalVariableMap> tmp = new HashMap<Long, LocalVariableMap>();

    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fname);
    LongWritable key = new LongWritable(); // workerID
    Text value = new Text();               // serialized var header (incl. filename)

    int countAll = 0;
    for (Path lpath : MatrixReader.getSequenceFilePaths(fs, path)) {
        SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(job), lpath, job);
        try {
            while (reader.next(key, value)) {
                if (!tmp.containsKey(key.get()))
                    tmp.put(key.get(), new LocalVariableMap());
                Object[] dat = ProgramConverter.parseDataObject(value.toString());
                tmp.get(key.get()).put((String) dat[0], (Data) dat[1]);
                countAll++;
            }
        } finally {
            if (reader != null)
                reader.close();
        }
    }

    LOG.debug("Num remote worker results (before deduplication): " + countAll);
    LOG.debug("Num remote worker results: " + tmp.size());

    // create return array
    return tmp.values().toArray(new LocalVariableMap[0]);
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteParForColocatedFileSplit.java
License:Open Source License
/**
 * Get the list of hostnames where the input split is located.
 */
@Override
public String[] getLocations() throws IOException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);

    // read task string
    LongWritable key = new LongWritable();
    Text value = new Text();
    RecordReader<LongWritable, Text> reader = new NLineInputFormat().getRecordReader(this, job, Reporter.NULL);
    reader.next(key, value);
    reader.close();

    // parse task
    Task t = Task.parseCompactString(value.toString());

    // get all locations
    HashMap<String, Integer> hosts = new HashMap<String, Integer>();

    if (t.getType() == TaskType.SET) {
        for (IntObject val : t.getIterations()) {
            String fname = _fname + "/" + String.valueOf((val.getLongValue() - 1) / _blen + 1);
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1)
                countHosts(hosts, bl.getHosts());
        }
    } else // TaskType.RANGE
    {
        // since this is a serial process, we use just the first and last iterations
        // as a heuristic for location information
        long lFrom = t.getIterations().get(0).getLongValue();
        long lTo = t.getIterations().get(1).getLongValue();
        for (long li : new long[] { lFrom, lTo }) {
            String fname = _fname + "/" + String.valueOf((li - 1) / _blen + 1);
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1)
                countHosts(hosts, bl.getHosts());
        }
    }

    // majority consensus on top host
    return getTopHosts(hosts);
}
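For concreteness, the arithmetic that maps an iteration number to its colocated partition file, assuming a hypothetical block length _blen of 1000:

// Iteration 2500 lands in partition file "<_fname>/3", since (2500 - 1) / 1000 + 1 == 3
long iteration = 2500;                     // hypothetical iteration number
long fileIdx = (iteration - 1) / 1000 + 1; // integer division yields a 1-based file index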
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteParWorkerMapper.java
License:Open Source License
@Override
public void map(LongWritable key, Text value, OutputCollector<Writable, Writable> out, Reporter reporter)
        throws IOException {
    LOG.trace("execute RemoteParWorkerMapper " + _stringID + " (" + _workerID + ")");

    // state for JVM reuse and multiple iterations
    long numIters = getExecutedIterations();

    try {
        // parse input task
        Task lTask = Task.parseCompactString(value.toString());

        // execute task (on error: re-try via Hadoop)
        executeTask(lTask);

        // write output if required (matrix indexed write)
        RemoteParForUtils.exportResultVariables(_workerID, _ec.getVariables(), _resultVars, _rvarFnames, out);
    } catch (Exception ex) {
        // throw IO exception to adhere to API specification
        throw new IOException("ParFOR: Failed to execute task.", ex);
    }

    // statistics maintenance
    RemoteParForUtils.incrementParForMRCounters(reporter, 1, getExecutedIterations() - numIters);

    // print heavy hitters per task
    JobConf job = ConfigurationManager.getCachedJobConf();
    if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode(job))
        LOG.info("\nSystemML Statistics:\nHeavy hitter instructions (name, time, count):\n"
                + Statistics.getHeavyHitters(10));
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java
License:Open Source License
/**
 * @param fnameNew
 * @param outMo
 * @param inMO
 * @throws DMLRuntimeException
 */
private void mergeTextCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO)
        throws DMLRuntimeException {
    try {
        // delete target file if it already exists
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);

        if (ALLOW_COPY_CELLFILES) {
            copyAllFiles(fnameNew, inMO);
            return; // we're done
        }

        // actual merge
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        FileSystem fs = FileSystem.get(job);
        Path path = new Path(fnameNew);
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

        String valueStr = null;

        try {
            for (MatrixObject in : inMO) // read/write all inputs
            {
                LOG.trace("ResultMerge (local, file): Merge input " + in.getVarName() + " (fname="
                        + in.getFileName() + ") via stream merge");

                JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
                Path tmpPath = new Path(in.getFileName());
                FileInputFormat.addInputPath(tmpJob, tmpPath);
                TextInputFormat informat = new TextInputFormat();
                informat.configure(tmpJob);
                InputSplit[] splits = informat.getSplits(tmpJob, 1);

                LongWritable key = new LongWritable();
                Text value = new Text();

                for (InputSplit split : splits) {
                    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, tmpJob,
                            Reporter.NULL);
                    try {
                        while (reader.next(key, value)) {
                            valueStr = value.toString().trim();
                            out.write(valueStr + "\n");
                        }
                    } finally {
                        if (reader != null)
                            reader.close();
                    }
                }
            }
        } finally {
            if (out != null)
                out.close();
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("Unable to merge text cell results.", ex);
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java
License:Open Source License
/**
 * @param fnameStaging
 * @param mo
 * @param ID
 * @throws IOException
 * @throws DMLRuntimeException
 */
private void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID)
        throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);

    LinkedList<Cell> buffer = new LinkedList<Cell>();
    LongWritable key = new LongWritable();
    Text value = new Text();

    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();

    // NOTE MB: Originally, we used long row, col inside the loop, but this led reproducibly to
    // JIT compilation errors during runtime; experienced under Windows, Intel x86-64, IBM JDK
    // 64bit/32bit. It works fine with int row, col but we require long for larger matrices.
    // Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode:
    // binarycell), we propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)
    FastStringTokenizer st = new FastStringTokenizer(' ');

    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            while (reader.next(key, value)) {
                st.reset(value.toString()); // reset tokenizer
                long row = st.nextLong();
                long col = st.nextLong();
                double lvalue = Double.parseDouble(st.nextToken());

                Cell tmp = new Cell(row, col, lvalue);
                buffer.addLast(tmp);
                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) // periodic flush
                {
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }

            // final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        } finally {
            if (reader != null)
                reader.close();
        }
    }
}
From source file:com.ibm.bi.dml.runtime.instructions.spark.functions.ConvertTextToString.java
License:Open Source License
@Override
public String call(Text arg0) throws Exception {
    return arg0.toString();
}
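The call signature implies the class implements Spark's Function<Text, String>; a minimal usage sketch, with textRdd as an assumed JavaRDD<Text>:

// Hypothetical usage: materialize immutable Strings from Hadoop's mutable Text records
JavaRDD<String> strings = textRdd.map(new ConvertTextToString());

Converting eagerly to String is the safe choice because Hadoop record readers reuse Text instances, so caching or shuffling the raw Text objects could yield corrupted results.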
From source file:com.ibm.bi.dml.runtime.io.ReaderTextCell.java
License:Open Source License
/**
 * @param path
 * @param job
 * @param dest
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws IOException
 */
private void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen,
        int brlen, int bclen) throws IOException {
    boolean sparse = dest.isInSparseFormat();

    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);

    LongWritable key = new LongWritable();
    Text value = new Text();
    int row = -1;
    int col = -1;

    try {
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                if (sparse) // SPARSE<-value
                {
                    while (reader.next(key, value)) {
                        st.reset(value.toString()); // reinit tokenizer
                        row = st.nextInt() - 1;     // convert 1-based text indices to 0-based
                        col = st.nextInt() - 1;
                        double lvalue = st.nextDouble();
                        dest.appendValue(row, col, lvalue);
                    }
                    dest.sortSparseRows();
                } else // DENSE<-value
                {
                    while (reader.next(key, value)) {
                        st.reset(value.toString()); // reinit tokenizer
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        double lvalue = st.nextDouble();
                        dest.setValueDenseUnsafe(row, col, lvalue);
                    }
                }
            } finally {
                if (reader != null)
                    reader.close();
            }
        }
    } catch (Exception ex) {
        // post-mortem error handling and bounds checking
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen) {
            throw new IOException("Matrix cell [" + (row + 1) + "," + (col + 1) + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else {
            throw new IOException("Unable to read matrix in text cell format.", ex);
        }
    }
}