List of usage examples for org.apache.hadoop.io.Text.toString()
@Override
public String toString()
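Text stores its contents as UTF-8 encoded bytes, and toString() decodes those bytes back into a java.lang.String. Before the examples below, a minimal round-trip sketch (the class name TextToStringDemo is made up for illustration):

import org.apache.hadoop.io.Text;

public class TextToStringDemo {
    public static void main(String[] args) {
        Text t = new Text("hello hadoop"); // stored internally as UTF-8 bytes
        String s = t.toString();           // decoded back into a java.lang.String
        System.out.println(s + " (" + t.getLength() + " bytes)");
    }
}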
From source file:com.hyperiongray.ccmr.s3wordcount.WordCountOnlyMapper.java
License:Apache License
public void map(Object key, Text value, OutputCollector<NullWritable, Text> outputCollector, Reporter reporter)
        throws IOException {

    // We're accessing a publicly available bucket, so we don't need to fill in our credentials
    ArchiveReader ar;
    try {
        S3Service s3s = new RestS3Service(null);

        // Grab a file out of the CommonCrawl S3 bucket
        String fn = value.toString();
        logger.info(fn);
        S3Object f = s3s.getObject("aws-publicdatasets", fn, null, null, null, null, null, null);

        // The file name identifies the ArchiveReader and indicates whether it should be decompressed
        ar = WARCReaderFactory.get(fn, f.getDataInputStream(), true);
    } catch (ServiceException e) {
        logger.error("S3 connection failed", e);
        throw new RuntimeException(e);
    }

    // Once we have an ArchiveReader, we can work through each of the records it contains
    int i = 0;
    logger.info("Started " + new Date());
    for (ArchiveRecord r : ar) {
        reporter.progress();
        String url = "";
        try {
            // The header contains information such as the type of record, size, creation time, and URL
            url = r.getHeader().getUrl();
            String crawledDate = r.getHeader().getDate();
            if (url == null)
                continue;

            // To read the contents of the record, use the ArchiveRecord as an InputStream
            // and dump it into an in-memory buffer
            OutputStream os = new ByteArrayOutputStream();
            try {
                r.dump(os);
            } finally {
                try {
                    r.close();
                } catch (Exception e) {
                    logger.error("reading inputstream failed", e);
                }
            }

            // Note: a potential optimization would be a large buffer allocated only once;
            // ByteArrayOutputStream.toString() decodes with the platform default charset
            String content = os.toString();

            Map<String, Integer> matches = contentMatcher.matchContent(content);
            int score = contentMatcher.score(matches);
            if (score > LOWER_SCORE_THRESHOLD) {
                logger.info("****************************************");
                logger.info("URL: " + url + " Score: " + score + " Detail: " + matches);
                outputCollector.collect(NullWritable.get(), new Text(outputParser
                        .parse(contentMatcher.getTitle(content), url, crawledDate, score, matches)));
            }

            logger.debug(String.valueOf(i));
            if (i++ > sampleSize) {
                logger.info("Finished " + new Date());
                break;
            }
        } catch (Exception e) {
            logger.error("url failed " + url, e);
        }
    }
}
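The mapper treats each incoming Text value as the S3 key of one WARC file, so the job is typically driven by a plain text file listing one key per line. A hedged wiring sketch using the classic mapred API (the driver class, input listing, and output path are assumptions, not part of the original source):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class S3WordCountDriver { // hypothetical driver class
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(S3WordCountDriver.class);
        conf.setJobName("s3wordcount");
        conf.setMapperClass(WordCountOnlyMapper.class);
        conf.setNumReduceTasks(0); // map-only sketch; adjust if a reducer is wired in
        conf.setOutputKeyClass(NullWritable.class);
        conf.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(conf, new Path("warc.paths")); // assumed listing of WARC keys
        FileOutputFormat.setOutputPath(conf, new Path("out"));      // assumed output directory
        JobClient.runJob(conf);
    }
}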
From source file:com.hyperiongray.ccmr.s3wordcount.WordCountReducer.java
License:Apache License
public void reduce(Text url, Iterator<IntWritable> scores,
        OutputCollector<Text, Map<Text, IntWritable>> results, Reporter reporter) throws IOException {
    String domainName;
    try {
        domainName = this.getDomainName(url.toString());

        Map<Text, IntWritable> map = Maps.newHashMap();
        int acum = 0;
        while (scores.hasNext()) {
            acum += scores.next().get();
        }
        map.put(new Text(url), new IntWritable(acum));
        results.collect(new Text(domainName), map);
    } catch (URISyntaxException e) {
        e.printStackTrace();
    }
}
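The getDomainName helper is not shown in this listing; since the reducer catches URISyntaxException, a plausible sketch based on java.net.URI (the actual implementation may differ) is:

private String getDomainName(String url) throws URISyntaxException {
    // Hypothetical implementation: reduce a full URL to its bare host name
    String host = new java.net.URI(url).getHost();
    if (host == null)
        throw new URISyntaxException(url, "no host component");
    return host.startsWith("www.") ? host.substring(4) : host;
}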
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.DataPartitionerLocal.java
License:Open Source License
/**
 * Partitions a matrix stored in text cell format: reads it from HDFS, buffers cells into a
 * local staging area, and writes the resulting partitions back to HDFS.
 *
 * @param fname
 * @param fnameStaging
 * @param fnameNew
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws DMLRuntimeException
 */
private void partitionTextCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen,
        int brlen, int bclen) throws DMLRuntimeException {
    long row = -1;
    long col = -1;

    try {
        // STEP 1: read matrix from HDFS and write blocks to local staging area
        // check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);

        LinkedList<Cell> buffer = new LinkedList<Cell>();
        LongWritable key = new LongWritable();
        Text value = new Text();
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value)) {
                    st.reset(value.toString()); // reset tokenizer
                    row = st.nextLong();
                    col = st.nextLong();
                    double lvalue = st.nextDouble();

                    Cell tmp = new Cell(row, col, lvalue);
                    buffer.addLast(tmp);
                    if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) // periodic flush
                    {
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                }

                // final flush
                if (!buffer.isEmpty()) {
                    appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                    buffer.clear();
                }
            } finally {
                if (reader != null)
                    reader.close();
            }
        }

        // STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);
                threads[i] = new Thread(new DataPartitionerWorkerTextCell(job, fnameNew, fnameStaging,
                        fnamesPartitions, start, end));
                threads[i].start();
            }
            for (Thread t : threads)
                t.join();
        } else {
            for (String pdir : fnamesPartitions)
                writeTextCellFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
        }
    } catch (Exception e) {
        // post-mortem error handling and bounds checking
        if (row < 1 || row > rlen || col < 1 || col > clen) {
            throw new DMLRuntimeException("Matrix cell [" + row + "," + col + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else
            throw new DMLRuntimeException("Unable to partition text cell matrix.", e);
    }
}
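Each line of the text cell format is a space-separated triple "row col value" with 1-based indices. FastStringTokenizer is SystemML's allocation-conscious tokenizer; the parse it performs is equivalent to this plain-Java sketch (illustrative only):

// Parse one text cell line such as "7 12 3.14"
String line = value.toString();      // value is the Text record delivered by the reader
String[] parts = line.split(" ");
long row = Long.parseLong(parts[0]); // 1-based row index
long col = Long.parseLong(parts[1]); // 1-based column index
double cellValue = Double.parseDouble(parts[2]);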
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteDPParForMR.java
License:Open Source License
/**
 * The result file contains a hierarchy of workerID-resultvar (incl. filename). We deduplicate
 * on the workerID. Without JVM reuse each task refers to a unique workerID, so we will not
 * find any duplicates. With JVM reuse, however, each slot refers to a workerID, and there are
 * duplicate filenames due to partial aggregation and overwrite of fname (the
 * RemoteParWorkerMapper ensures uniqueness of those files independent of the runtime
 * implementation).
 *
 * @param job
 * @param fname
 * @return array of local variable maps, one per worker
 * @throws DMLRuntimeException
 * @throws IOException
 */
@SuppressWarnings("deprecation")
public static LocalVariableMap[] readResultFile(JobConf job, String fname)
        throws DMLRuntimeException, IOException {
    HashMap<Long, LocalVariableMap> tmp = new HashMap<Long, LocalVariableMap>();

    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fname);
    LongWritable key = new LongWritable(); // workerID
    Text value = new Text();               // serialized var header (incl. filename)

    int countAll = 0;
    for (Path lpath : MatrixReader.getSequenceFilePaths(fs, path)) {
        SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(job), lpath, job);
        try {
            while (reader.next(key, value)) {
                if (!tmp.containsKey(key.get()))
                    tmp.put(key.get(), new LocalVariableMap());
                Object[] dat = ProgramConverter.parseDataObject(value.toString());
                tmp.get(key.get()).put((String) dat[0], (Data) dat[1]);
                countAll++;
            }
        } finally {
            if (reader != null)
                reader.close();
        }
    }

    LOG.debug("Num remote worker results (before deduplication): " + countAll);
    LOG.debug("Num remote worker results: " + tmp.size());

    // create return array
    return tmp.values().toArray(new LocalVariableMap[0]);
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteParForColocatedFileSplit.java
License:Open Source License
/**
 * Get the list of hostnames where the input split is located.
 */
@Override
public String[] getLocations() throws IOException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);

    // read task string
    LongWritable key = new LongWritable();
    Text value = new Text();
    RecordReader<LongWritable, Text> reader = new NLineInputFormat().getRecordReader(this, job, Reporter.NULL);
    reader.next(key, value);
    reader.close();

    // parse task
    Task t = Task.parseCompactString(value.toString());

    // get all locations
    HashMap<String, Integer> hosts = new HashMap<String, Integer>();

    if (t.getType() == TaskType.SET) {
        for (IntObject val : t.getIterations()) {
            String fname = _fname + "/" + String.valueOf((val.getLongValue() - 1) / _blen + 1);
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1)
                countHosts(hosts, bl.getHosts());
        }
    } else // TaskType.RANGE
    {
        // since this is a serial process, we use just the first and last iterations
        // as a heuristic for location information
        long lFrom = t.getIterations().get(0).getLongValue();
        long lTo = t.getIterations().get(1).getLongValue();
        for (long li : new long[] { lFrom, lTo }) {
            String fname = _fname + "/" + String.valueOf((li - 1) / _blen + 1);
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1)
                countHosts(hosts, bl.getHosts());
        }
    }

    // majority consensus on top host
    return getTopHosts(hosts);
}
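For concreteness, the arithmetic that maps an iteration number to its colocated partition file, assuming a hypothetical block length _blen of 1000:

// Iteration 2500 lands in partition file "<_fname>/3", since (2500 - 1) / 1000 + 1 == 3
long iteration = 2500;                     // hypothetical iteration number
long fileIdx = (iteration - 1) / 1000 + 1; // integer division yields a 1-based file index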
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.RemoteParWorkerMapper.java
License:Open Source License
@Override
public void map(LongWritable key, Text value, OutputCollector<Writable, Writable> out, Reporter reporter)
        throws IOException {
    LOG.trace("execute RemoteParWorkerMapper " + _stringID + " (" + _workerID + ")");

    // state for JVM reuse and multiple iterations
    long numIters = getExecutedIterations();

    try {
        // parse input task
        Task lTask = Task.parseCompactString(value.toString());

        // execute task (on error: re-try via Hadoop)
        executeTask(lTask);

        // write output if required (matrix indexed write)
        RemoteParForUtils.exportResultVariables(_workerID, _ec.getVariables(), _resultVars, _rvarFnames, out);
    } catch (Exception ex) {
        // throw IO exception to adhere to API specification
        throw new IOException("ParFOR: Failed to execute task.", ex);
    }

    // statistics maintenance
    RemoteParForUtils.incrementParForMRCounters(reporter, 1, getExecutedIterations() - numIters);

    // print heavy hitters per task
    JobConf job = ConfigurationManager.getCachedJobConf();
    if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode(job))
        LOG.info("\nSystemML Statistics:\nHeavy hitter instructions (name, time, count):\n"
                + Statistics.getHeavyHitters(10));
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java
License:Open Source License
/**
 * @param fnameNew
 * @param outMo
 * @param inMO
 * @throws DMLRuntimeException
 */
private void mergeTextCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO)
        throws DMLRuntimeException {
    try {
        // delete target file if it already exists
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);

        if (ALLOW_COPY_CELLFILES) {
            copyAllFiles(fnameNew, inMO);
            return; // we're done
        }

        // actual merge
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        FileSystem fs = FileSystem.get(job);
        Path path = new Path(fnameNew);
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

        String valueStr = null;

        try {
            for (MatrixObject in : inMO) // read/write all inputs
            {
                LOG.trace("ResultMerge (local, file): Merge input " + in.getVarName() + " (fname="
                        + in.getFileName() + ") via stream merge");

                JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
                Path tmpPath = new Path(in.getFileName());
                FileInputFormat.addInputPath(tmpJob, tmpPath);
                TextInputFormat informat = new TextInputFormat();
                informat.configure(tmpJob);
                InputSplit[] splits = informat.getSplits(tmpJob, 1);

                LongWritable key = new LongWritable();
                Text value = new Text();

                for (InputSplit split : splits) {
                    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, tmpJob,
                            Reporter.NULL);
                    try {
                        while (reader.next(key, value)) {
                            valueStr = value.toString().trim();
                            out.write(valueStr + "\n");
                        }
                    } finally {
                        if (reader != null)
                            reader.close();
                    }
                }
            }
        } finally {
            if (out != null)
                out.close();
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("Unable to merge text cell results.", ex);
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java
License:Open Source License
/**
 * @param fnameStaging
 * @param mo
 * @param ID
 * @throws IOException
 * @throws DMLRuntimeException
 */
private void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID)
        throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);

    LinkedList<Cell> buffer = new LinkedList<Cell>();
    LongWritable key = new LongWritable();
    Text value = new Text();

    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();

    // NOTE MB: Originally, we used long row, col inside the loop, but this led reproducibly to
    // JIT compilation errors during runtime; experienced under Windows, Intel x86-64, IBM JDK
    // 64bit/32bit. It works fine with int row, col but we require long for larger matrices.
    // Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode:
    // binarycell), we propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)
    FastStringTokenizer st = new FastStringTokenizer(' ');

    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            while (reader.next(key, value)) {
                st.reset(value.toString()); // reset tokenizer
                long row = st.nextLong();
                long col = st.nextLong();
                double lvalue = Double.parseDouble(st.nextToken());

                Cell tmp = new Cell(row, col, lvalue);
                buffer.addLast(tmp);
                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) // periodic flush
                {
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }

            // final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        } finally {
            if (reader != null)
                reader.close();
        }
    }
}
From source file:com.ibm.bi.dml.runtime.instructions.spark.functions.ConvertTextToString.java
License:Open Source License
@Override
public String call(Text arg0) throws Exception {
    return arg0.toString();
}
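The call signature implies the class implements Spark's Function<Text, String>; a minimal usage sketch, with textRdd as an assumed JavaRDD<Text>:

// Hypothetical usage: materialize immutable Strings from Hadoop's mutable Text records
JavaRDD<String> strings = textRdd.map(new ConvertTextToString());

Converting eagerly to String is the safe choice because Hadoop record readers reuse Text instances, so caching or shuffling the raw Text objects could yield corrupted results.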
From source file:com.ibm.bi.dml.runtime.io.ReaderTextCell.java
License:Open Source License
/**
 * @param path
 * @param job
 * @param dest
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws IOException
 */
private void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen,
        int brlen, int bclen) throws IOException {
    boolean sparse = dest.isInSparseFormat();

    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);

    LongWritable key = new LongWritable();
    Text value = new Text();
    int row = -1;
    int col = -1;

    try {
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                if (sparse) // SPARSE<-value
                {
                    while (reader.next(key, value)) {
                        st.reset(value.toString()); // reinit tokenizer
                        row = st.nextInt() - 1;     // convert 1-based text indices to 0-based
                        col = st.nextInt() - 1;
                        double lvalue = st.nextDouble();
                        dest.appendValue(row, col, lvalue);
                    }
                    dest.sortSparseRows();
                } else // DENSE<-value
                {
                    while (reader.next(key, value)) {
                        st.reset(value.toString()); // reinit tokenizer
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        double lvalue = st.nextDouble();
                        dest.setValueDenseUnsafe(row, col, lvalue);
                    }
                }
            } finally {
                if (reader != null)
                    reader.close();
            }
        }
    } catch (Exception ex) {
        // post-mortem error handling and bounds checking
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen) {
            throw new IOException("Matrix cell [" + (row + 1) + "," + (col + 1) + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else {
            throw new IOException("Unable to read matrix in text cell format.", ex);
        }
    }
}