List of usage examples for org.apache.hadoop.io.Text#toString()
@Override
public String toString()
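Text stores its contents as UTF-8-encoded bytes, and toString() decodes those bytes back into a java.lang.String. A minimal standalone sketch (the class name is illustrative):

import org.apache.hadoop.io.Text;

public class TextToStringDemo {
    public static void main(String[] args) {
        Text text = new Text("hello hadoop");
        String s = text.toString(); // decodes the UTF-8 bytes into a String
        System.out.println(s);      // prints "hello hadoop"
    }
}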
From source file:com.googlecode.hive.serde.FixedLengthAndDelimitedSerde.java
License:Apache License
@Override
public Object deserialize(Writable blob) throws SerDeException {
    if (inputFormatString == null) {
        throw new SerDeException(
                "This table does not have serde property \"" + INPUT_FORMAT_STRING + "\"!");
    }
    Text rowText = (Text) blob;
    Map<Integer, String> columnValues = getColumnValues(rowText.toString());

    // If the row does not match, ignore the line and return null.
    if (columnValues.keySet().size() != numColumns) {
        unmatchedRows++;
        if (unmatchedRows >= nextUnmatchedRows) {
            nextUnmatchedRows = getNextNumberToDisplay(nextUnmatchedRows);
            // Report the row
            LOG.warn(unmatchedRows + " unmatched rows are found: " + rowText);
        }
        return null;
    }

    // Otherwise, return the row.
    for (int c = 0; c < numColumns; c++) {
        try {
            row.set(c, columnValues.get(c));
        } catch (RuntimeException e) {
            partialMatchedRows++;
            if (partialMatchedRows >= nextPartialMatchedRows) {
                nextPartialMatchedRows = getNextNumberToDisplay(nextPartialMatchedRows);
                // Report the row
                LOG.warn(partialMatchedRows + " partially unmatched rows are found,"
                        + " cannot find column number " + c + ": " + rowText);
            }
            row.set(c, null);
        }
    }
    return row;
}
From source file:com.gsvic.csmr.CSMRMapper.java
License:Apache License
@Override
public void map(Text key, VectorWritable value, Context context) throws IOException, InterruptedException {
    DocumentWritable p = new DocumentWritable(new Text(key.toString()), new VectorWritable(value.get()));
    context.write(new IntWritable(1), p);
}
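Note the defensive copies here: Hadoop reuses the same Text and VectorWritable instances across calls to map(), so the mapper materializes fresh objects via key.toString() and value.get() rather than holding references that the framework would overwrite on the next record.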
From source file:com.gsvic.csmr.DocumentWritable.java
License:Apache License
public DocumentWritable(Text key, VectorWritable value) {
    this.key = new Text(key.toString());
    this.value = new VectorWritable(value.get());
}
From source file:com.gsvic.csmr.io.InputData.java
License:Apache License
/**
 * Reads a vectorized text file (tf or tf-idf vectors).
 *
 * @param conf the Hadoop configuration
 * @param input the path of the sequence file to read
 * @return the vectorized text file in a HashMap
 * @throws IOException
 */
public static HashMap<Text, VectorWritable> vectorizedTextReader(Configuration conf, Path input)
        throws IOException {
    FileSystem fs = FileSystem.get(conf);
    HashMap<Text, VectorWritable> dcf = new HashMap<>();
    Text key = new Text();
    VectorWritable value = new VectorWritable();

    // Close the reader even if iteration fails.
    try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, input, conf)) {
        while (reader.next(key, value)) {
            // Copy key and value, since reader.next() reuses the same instances.
            dcf.put(new Text(key.toString()), new VectorWritable(value.get()));
        }
    }
    return dcf;
}
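A minimal caller sketch for the reader above, assuming the Mahout VectorWritable used by the surrounding CSMR code; the vector file path is a hypothetical placeholder:

import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.mahout.math.VectorWritable;

public class VectorReaderDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path vectors = new Path("/tmp/tfidf-vectors/part-r-00000"); // hypothetical path
        HashMap<Text, VectorWritable> docs = InputData.vectorizedTextReader(conf, vectors);
        for (Map.Entry<Text, VectorWritable> entry : docs.entrySet()) {
            // toString() decodes the UTF-8 document id; get() returns the Mahout vector
            System.out.println(entry.getKey().toString() + " -> " + entry.getValue().get());
        }
    }
}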
From source file:com.hadoop.examples.anagrams.AnagramReducer.java
License:Apache License
public void reduce(Text anagramKey, Iterator<Text> anagramValues, OutputCollector<Text, Text> results,
        Reporter reporter) throws IOException {
    String output = "";
    while (anagramValues.hasNext()) {
        Text anagram = anagramValues.next();
        output = output + anagram.toString() + "~";
    }
    StringTokenizer outputTokenizer = new StringTokenizer(output, "~");
    // Only emit keys that map to at least two anagrams.
    if (outputTokenizer.countTokens() >= 2) {
        output = output.replace("~", ",");
        outputKey.set(anagramKey.toString());
        outputValue.set(output);
        results.collect(outputKey, outputValue);
    }
}
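The loop above concatenates into a String and then re-tokenizes the result just to count entries. A StringBuilder-based variant that tracks the count directly avoids the extra pass; a minimal sketch of the same reduce body, assuming the same outputKey and outputValue fields:

StringBuilder joined = new StringBuilder();
int count = 0;
while (anagramValues.hasNext()) {
    if (count > 0) {
        joined.append(','); // same comma-separated output format
    }
    joined.append(anagramValues.next().toString());
    count++;
}
// Only emit keys that map to at least two anagrams.
if (count >= 2) {
    outputKey.set(anagramKey.toString());
    outputValue.set(joined.toString());
    results.collect(outputKey, outputValue);
}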
From source file:com.hadoop.examples.geolocation.GeoLocationMapper.java
License:Apache License
public void map(LongWritable key, Text value, OutputCollector<Text, Text> outputCollector, Reporter reporter)
        throws IOException {
    String dataRow = value.toString();
    // Since these are tab-separated files, tokenize on tab.
    StringTokenizer dataTokenizer = new StringTokenizer(dataRow, "\t");
    String articleName = dataTokenizer.nextToken();
    String pointType = dataTokenizer.nextToken();
    String geoPoint = dataTokenizer.nextToken();
    // We know that this data row is a GEO RSS type point.
    if (GEO_RSS_URI.equals(pointType)) {
        // Now we process the GEO point data.
        StringTokenizer st = new StringTokenizer(geoPoint, " ");
        String strLat = st.nextToken();
        String strLong = st.nextToken();
        double lat = Double.parseDouble(strLat);
        double lang = Double.parseDouble(strLong);
        long roundedLat = Math.round(lat);
        long roundedLong = Math.round(lang);
        String locationKey = "(" + String.valueOf(roundedLat) + "," + String.valueOf(roundedLong) + ")";
        String locationName = URLDecoder.decode(articleName, "UTF-8");
        locationName = locationName.replace("_", " ");
        locationName = locationName + ":(" + lat + "," + lang + ")";
        geoLocationKey.set(locationKey);
        geoLocationName.set(locationName);
        outputCollector.collect(geoLocationKey, geoLocationName);
    }
}
From source file:com.hadoop.examples.geolocation.GeoLocationReducer.java
License:Apache License
public void reduce(Text geoLocationKey, Iterator<Text> geoLocationValues, OutputCollector<Text, Text> results,
        Reporter reporter) throws IOException {
    // In this case the reducer just builds a list so that the data can be used later.
    String outputText = "";
    while (geoLocationValues.hasNext()) {
        Text locationName = geoLocationValues.next();
        outputText = outputText + locationName.toString() + " ,";
    }
    outputKey.set(geoLocationKey.toString());
    outputValue.set(outputText);
    results.collect(outputKey, outputValue);
}
From source file:com.hadoop.oozie.examples.DemoMapper.java
License:Open Source License
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
        throws IOException {
    String line = value.toString();
    StringTokenizer tokenizer = new StringTokenizer(line);
    while (tokenizer.hasMoreTokens()) {
        word.set(tokenizer.nextToken());
        output.collect(word, one);
    }
}
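A minimal driver sketch for running a word-count mapper like this with the old org.apache.hadoop.mapred API, assuming DemoMapper implements the old Mapper interface; the driver class, reducer, and paths are illustrative, not part of the original example:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class DemoDriver {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(DemoDriver.class);
        conf.setJobName("demo-wordcount");
        conf.setOutputKeyClass(Text.class);          // keys are words
        conf.setOutputValueClass(IntWritable.class); // values are counts
        conf.setMapperClass(DemoMapper.class);       // the mapper shown above
        // A reducer summing the IntWritable counts per word would be set here,
        // e.g. conf.setReducerClass(DemoReducer.class); (DemoReducer is hypothetical)
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        JobClient.runJob(conf);
    }
}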
From source file:com.hdfs.concat.crush.Crush.java
License:Apache License
private void cloneOutput() throws IOException {
    List<FileStatus> listStatus = getOutputMappings();

    /*
     * Initialize to empty list, in which case swap() will be a no-op. The reference is then replaced with a real
     * list, which is used in the subsequent iterations.
     */
    List<Path> crushInput = emptyList();

    Text srcFile = new Text();
    Text crushOut = new Text();
    Text prevCrushOut = new Text();

    for (FileStatus partFile : listStatus) {
        Path path = partFile.getPath();
        Reader reader = new Reader(fs, path, fs.getConf());
        try {
            while (reader.next(srcFile, crushOut)) {
                if (!crushOut.equals(prevCrushOut)) {
                    swap(crushInput, prevCrushOut.toString());
                    prevCrushOut.set(crushOut);
                    crushInput = new LinkedList<Path>();
                }
                crushInput.add(new Path(srcFile.toString()));
            }
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                LOG.warn("Trapped exception when closing " + path, e);
            }
        }
        swap(crushInput, prevCrushOut.toString());
    }
}
From source file:com.hdfs.concat.crush.Crush.java
License:Apache License
/**
 * Moves the skipped files to the output directory. Called when operating in normal (non-clone) mode.
 */
private void moveOutput() throws IOException {
    List<FileStatus> listStatus = getOutputMappings();

    Text srcFile = new Text();
    Text crushOut = new Text();

    Set<String> crushOutputFiles = new HashSet<String>(nBuckets);

    for (FileStatus partFile : listStatus) {
        Path path = partFile.getPath();
        Reader reader = new Reader(fs, path, fs.getConf());
        try {
            while (reader.next(srcFile, crushOut)) {
                crushOutputFiles.add(new Path(crushOut.toString()).toUri().getPath());
            }
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                LOG.warn("Trapped exception when closing " + path, e);
            }
        }
    }

    assert crushOutputFiles.size() == nBuckets;

    /*
     * The crush output files will appear in a subdirectory of the output directory. The subdirectory will be the
     * full path of the input directory that was crushed. E.g.
     *
     * Crush input:
     * /user/me/input/dir1/file1
     * /user/me/input/dir1/file2
     * /user/me/input/dir2/file3
     * /user/me/input/dir2/file4
     * /user/me/input/dir3/dir4/file5
     * /user/me/input/dir3/dir4/file6
     *
     * Crush output:
     * /user/me/output/user/me/input/dir1/crushed_file ...
     * /user/me/output/user/me/input/dir2/crushed_file ...
     * /user/me/output/user/me/input/dir2/dir3/dir4/crushed_file ...
     *
     * We need to collapse this down to:
     * /user/me/output/dir1/crushed_file ...
     * /user/me/output/dir2/crushed_file ...
     * /user/me/output/dir2/dir3/dir4/crushed_file ...
     */
    String srcDirName = fs.makeQualified(srcDir).toUri().getPath();
    String destName = fs.makeQualified(dest).toUri().getPath();
    String partToReplace = fs.makeQualified(outDir).toUri().getPath() + "/crush" + srcDirName;

    print(Verbosity.INFO, "\n\nCopying crush files to " + destName);

    for (String crushOutputFile : crushOutputFiles) {
        Path srcPath = new Path(crushOutputFile);
        Path destPath = new Path(destName + crushOutputFile.substring(partToReplace.length())).getParent();
        rename(srcPath, destPath, null);
    }

    print(Verbosity.INFO, "\n\nMoving skipped files to " + destName);

    /*
     * Don't forget to move the files that were not crushed to the output dir so that the output dir has all the
     * data that was in the input dir, the difference being there are fewer files in the output dir.
     */
    for (String name : skippedFiles) {
        Path srcPath = new Path(name);
        Path destPath = new Path(destName + name.substring(srcDirName.length())).getParent();
        rename(srcPath, destPath, null);
    }
}