List of usage examples for the org.apache.hadoop.io.Text constructor Text
public Text(byte[] utf8)
From source file:br.ufrj.nce.recureco.distributedindex.indexer.IndexerReduce.java
License:Open Source License
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { StringBuilder sb = new StringBuilder(); boolean firstTime = true; while (values.hasNext()) { String newDocument = values.next().toString(); if (newDocument != null && newDocument.trim().length() > 0 && !sb.toString().contains(newDocument)) { if (!firstTime) { sb.append(","); }// w ww.j ava 2 s. com sb.append(newDocument); firstTime = false; } } output.collect(key, new Text(sb.toString())); }
From source file:brickhouse.udf.dcache.DistributedMapUDF.java
License:Apache License
/**
 * Loads key/value pairs from {@code mapFilename} into {@code map}.
 *
 * <p>Paths ending in {@code "crc"} are ignored. A directory is walked recursively. Any other
 * path is read line by line; each line is deserialized with the SerDe returned by
 * {@code getLineSerde()} and its {@code key} and {@code value} fields are stored in the map.
 *
 * @param map         destination map; later entries overwrite earlier ones with the same key
 * @param mapFilename path of a file or directory to load
 * @throws IOException    if a directory cannot be listed or a file cannot be read
 * @throws SerDeException if a line cannot be deserialized
 */
private void addValues(HashMap<Object, Object> map, String mapFilename) throws IOException, SerDeException {
    if (mapFilename.endsWith("crc")) {
        LOG.info(" Ignoring CRC file " + mapFilename);
        return;
    }

    File mapFile = new File(mapFilename);
    if (mapFile.isDirectory()) {
        String[] subFiles = mapFile.list();
        if (subFiles == null) {
            // File.list() returns null on an I/O error; report it instead of failing with an NPE.
            throw new IOException("Unable to list directory " + mapFilename);
        }
        for (String subFile : subFiles) {
            LOG.info("Checking recursively " + subFile);
            addValues(map, mapFilename + "/" + subFile);
        }
        return;
    }

    // NOTE(review): the reader uses the platform default charset - confirm the map files'
    // encoding and pin it explicitly if they are always UTF-8.
    BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(mapFile)));
    try {
        SerDe lazy = getLineSerde();
        StructObjectInspector lineInsp = (StructObjectInspector) lazy.getObjectInspector();
        StructField keyRef = lineInsp.getStructFieldRef("key");
        StructField valueRef = lineInsp.getStructFieldRef("value");

        String line;
        while ((line = reader.readLine()) != null) {
            Writable lineText = new Text(line);
            Object lineObj = lazy.deserialize(lineText);
            List<Object> objList = lineInsp.getStructFieldsDataAsList(lineObj);
            Object key = ((PrimitiveObjectInspector) keyRef.getFieldObjectInspector())
                    .getPrimitiveJavaObject(objList.get(0));
            Object val = ((PrimitiveObjectInspector) valueRef.getFieldObjectInspector())
                    .getPrimitiveJavaObject(objList.get(1));
            map.put(key, val);
        }
    } finally {
        // Always release the file handle, including when deserialization throws.
        reader.close();
    }
}
From source file:brickhouse.udf.sketch.Md5.java
License:Apache License
/**
 * Computes the MD5 digest of {@code s} and returns it as a 32-character lowercase hex string.
 *
 * @param s the text to hash; may be {@code null}
 * @return the hex-encoded digest, or {@code null} when {@code s} is {@code null}
 * @throws IllegalStateException if the JVM provides no MD5 implementation
 */
public Text evaluate(final Text s) {
    if (s == null) {
        return null;
    }

    final MessageDigest md;
    try {
        md = MessageDigest.getInstance("MD5");
    } catch (NoSuchAlgorithmException nsae) {
        // Fail this call only; terminating the whole JVM from inside a UDF would take the
        // hosting process down with it.
        throw new IllegalStateException("Cannot find digest algorithm", nsae);
    }

    // Encode explicitly as UTF-8 so the digest does not depend on the platform default charset.
    md.update(s.toString().getBytes(java.nio.charset.Charset.forName("UTF-8")));
    byte[] md5hash = md.digest();

    StringBuilder builder = new StringBuilder(md5hash.length * 2);
    for (byte b : md5hash) {
        // Adding 0x100 forces a three-digit hex string; dropping the first digit leaves
        // exactly two digits with a leading zero when needed.
        builder.append(Integer.toString((b & 0xff) + 0x100, 16).substring(1));
    }
    return new Text(builder.toString());
}
From source file:BU.MET.CS755.SpeciesGraphBuilderMapper.java
public void map(Text key, Text value, OutputCollector output, Reporter reporter) throws IOException { String title = null;/* w ww.j a va2 s. co m*/ String inputString; ArrayList<String> outlinks = null; //Get the DOM Builder Factory DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); try { //Get the DOM Builder DocumentBuilder builder = factory.newDocumentBuilder(); //Load and Parse the XML document //document contains the complete XML as a Tree. inputString = key.toString(); InputSource is = new InputSource(new StringReader(key.toString())); Document document = builder.parse(is); // Look for taxonavigation marker. if ((inputString.indexOf("== Taxonavigation ==") == -1) && (inputString.indexOf("==Taxonavigation==") == -1)) { return; } // Get the title node NodeList nodeList = document.getDocumentElement().getChildNodes(); NodeList theTitle = document.getElementsByTagName("title"); // Parse the species name from the title node. for (int i = 0; i < theTitle.getLength(); i++) { Node theNode = theTitle.item(i); Node nodeVal = theNode.getFirstChild(); title = nodeVal.getNodeValue(); title = title.replace(":", "_"); } // Get the sub-species list from <text> NodeList theText = document.getElementsByTagName("text"); for (int i = 0; i < theText.getLength(); i++) { Node theNode = theText.item(i); Node nodeVal = theNode.getFirstChild(); if (nodeVal != null) { outlinks = GetOutlinks(nodeVal.getNodeValue()); } } } catch (Exception e) { e.printStackTrace(); } if (title != null && title.length() > 0) { reporter.setStatus(title); } else { return; } StringBuilder builder = new StringBuilder(); if (outlinks != null) { for (String link : outlinks) { link = link.replace(" ", "_"); link = link.replace("\n", ""); builder.append(" "); builder.append(link); } } // remove any newlines if (builder.toString().contains("\n")) { builder.toString().replace("\n", ""); } output.collect(new Text(title), new Text(builder.toString())); }
From source file:BU.MET.CS755.SpeciesGraphBuilderReducer.java
/**
 * Joins every value received for {@code key} into one record of the form
 * {@code "<count>: <v1> <v2> ..."}, where {@code count} is the number of values.
 *
 * @param key      the key being reduced; emitted unchanged and also set as the reporter status
 * @param values   the values grouped under {@code key}
 * @param output   collector receiving the joined record
 * @param reporter receives {@code key} as its status
 * @throws IOException if the collector fails to accept the record
 */
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
    reporter.setStatus(key.toString());

    StringBuilder pages = new StringBuilder();
    int count = 0;
    while (values.hasNext()) {
        // Values are appended verbatim. Spaces inside a value must NOT be rewritten to '_':
        // a value may itself be a space-separated list, and rewriting would fuse it into one token.
        pages.append(" ").append(values.next().toString());
        count += 1;
    }

    output.collect(key, new Text(count + ":" + pages));
}
From source file:BU.MET.CS755.SpeciesIterMapper2.java
public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException { // get the current page String data = ((Text) value).toString(); int index = data.indexOf(":"); if (index == -1) { String[] splits = value.toString().split("\t"); if (splits.length == 0) { splits = value.toString().split(" "); if (splits.length == 0) { return; }/*from w w w. jav a2s . c o m*/ } String pagetitle = splits[0].trim(); String pagerank = splits[splits.length - 1].trim(); output.collect(new Text(pagetitle), new Text("oldrank=" + pagerank)); return; } // split into title and PR (tab or variable number of blank spaces) String toParse = data.substring(0, index).trim(); String[] splits = toParse.split("\t"); if (splits.length == 0) { splits = toParse.split(" "); if (splits.length == 0) { return; } } String pagetitle = splits[0].trim(); String pagerank = splits[splits.length - 1].trim(); // parse current score double currScore = 0.0; try { currScore = Double.parseDouble(pagerank); } catch (Exception e) { currScore = 1.0; } // get number of outlinks data = data.substring(index + 1); String[] pages = data.split(" "); int numoutlinks = 0; if (pages.length == 0) { numoutlinks = 1; } else { for (String page : pages) { if (page.length() > 0) { numoutlinks = numoutlinks + 1; } } } // collect each outlink, with the dampened PR of its inlink, and its inlink Text toEmit = new Text((new Double(.85 * currScore / numoutlinks)).toString()); for (String page : pages) { if (page.length() > 0) { output.collect(new Text(page), toEmit); //output.collect(new Text(page), new Text(" " + pagetitle)); } } // collect the inlink with its damping factor, and all outlinks double selfRank = (1 - 0.85) / totalLinks; output.collect(new Text(pagetitle), new Text(Double.toString(selfRank))); output.collect(new Text(pagetitle), new Text(" " + data)); // Output the rank of the current page prepended by the string "oldrank=" // for the reducer to compare the newly calculated 
page rank against. output.collect(new Text(pagetitle), new Text("oldrank=" + Double.toString(currScore))); }
From source file:BU.MET.CS755.SpeciesIterReducer2.java
public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { double score = 0; String outLinks = ""; double oldScore = 0; // Counting links reporter.incrCounter(BU.MET.CS755.SpeciesIterDriver2.ITERATION_COUNTER.TOTAL_LINKS, 1L); if (iterationNumber == 1) { return;//w ww . ja v a2 s . c o m } while (values.hasNext()) { String curr = ((Text) values.next()).toString(); int colon = curr.indexOf(":"); int space = curr.indexOf(" "); int oldrank = curr.indexOf("oldrank"); if ((colon > -1)) { String presScore = curr.substring(0, colon); try { score += Double.parseDouble(presScore); oldScore = score; outLinks = curr.substring(colon + 1); continue; } catch (Exception e) { } } if (space > -1) { outLinks = curr; } else if (oldrank > -1) { oldScore = new Double(curr.substring(oldrank + 8)); } else { score += Double.parseDouble(curr); } } String toEmit; if (outLinks.length() > 0) { toEmit = (new Double(score)).toString() + ":" + outLinks; } else { toEmit = (new Double(score)).toString(); } // Output the new page rank output.collect(key, new Text(toEmit)); double delta = oldScore - score; // Check how much the new page rank has changed. If the change is less // than two decimal places, treat it as a converged value. If not, // we need to re-calculate the rank with one more iteration; inform the // driver about that by incrementing the iterations needed counter. if ((delta > 0.009) || (delta < -0.009)) { Counter myCounter2 = reporter .getCounter(BU.MET.CS755.SpeciesIterDriver2.ITERATION_COUNTER.ITERATIONS_NEEDED); if (myCounter2 != null) { reporter.incrCounter(BU.MET.CS755.SpeciesIterDriver2.ITERATION_COUNTER.ITERATIONS_NEEDED, 1L); } } }
From source file:BU.MET.CS755.SpeciesViewerMapper.java
public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException { // get the current page String data = ((Text) value).toString(); int index = data.indexOf(":"); // Can happen if no outlinks if (index == -1) { index = data.length();// w w w. java2s . c o m } // split into title and PR (tab or variable number of blank spaces) String toParse = data.substring(0, index).trim(); String[] splits = toParse.split("\t"); if (splits.length == 0) { splits = toParse.split(" "); if (splits.length == 0) { return; } } String pagetitle = splits[0].trim(); String pagerank = splits[splits.length - 1].trim(); // parse score double currScore = 0.0; try { currScore = Double.parseDouble(pagerank); } catch (Exception e) { currScore = 0.0; } // collect //output.collect(new FloatWritable((float) - currScore), key); output.collect(new FloatWritable((float) -currScore * 1000), new Text(pagetitle)); }
From source file:car.Classification.java
License:Open Source License
public static Text getClassPercentages(String path, String content) { int[] classes = new int[4]; int total = 0; StringTokenizer lineTok = new StringTokenizer(content); String record = ""; while (lineTok.hasMoreTokens()) { record = lineTok.nextToken();/* w w w . j a va 2 s . com*/ // Iterates over the String content, incrementing the corresponding class counters. classes[getClassIndex(record.substring(record.lastIndexOf(',') + 1))] += 1; } total = classes[0] + classes[1] + classes[2] + classes[3]; return new Text(path + "-class=" + CLASS_UNACC + ":" + classes[0] * 100 / total + "%, class=" + CLASS_ACC + ":" + classes[1] * 100 / total + "%, class=" + CLASS_GOOD + ":" + classes[2] * 100 / total + "%, class=" + CLASS_VGOOD + ":" + classes[3] * 100 / total + "%"); }
From source file:cascading.hive.ORCFile.java
License:Apache License
/**
 * Copies every field of {@code tuple} into {@code struct}, wrapping each value in the writable
 * that matches the field's declared type.
 *
 * <p>The type of field {@code i} is looked up in {@code typeMapping} using the lower-cased
 * entry of {@code types}; both are members declared outside this method. A {@code null} tuple
 * value is stored as {@code null}. Types without a dedicated case are stored as {@code Text}.
 *
 * @param tuple  source of the field values
 * @param struct destination whose fields are set by position
 */
private void tuple2Struct(Tuple tuple, OrcStruct struct) {
    for (int i = 0; i < types.length; i++) {
        // Stays null unless the tuple holds a value for this position.
        Object converted = null;
        switch (typeMapping.get(types[i].toLowerCase())) {
        case INT:
            if (tuple.getObject(i) != null) {
                converted = new IntWritable(tuple.getInteger(i));
            }
            break;
        case BOOLEAN:
            if (tuple.getObject(i) != null) {
                converted = new BooleanWritable(tuple.getBoolean(i));
            }
            break;
        case TINYINT:
            if (tuple.getObject(i) != null) {
                converted = new ByteWritable(Byte.valueOf(tuple.getString(i)));
            }
            break;
        case SMALLINT:
            if (tuple.getObject(i) != null) {
                converted = new ShortWritable(tuple.getShort(i));
            }
            break;
        case BIGINT:
            if (tuple.getObject(i) != null) {
                converted = new LongWritable(tuple.getLong(i));
            }
            break;
        case FLOAT:
            if (tuple.getObject(i) != null) {
                converted = new FloatWritable(tuple.getFloat(i));
            }
            break;
        case DOUBLE:
            if (tuple.getObject(i) != null) {
                converted = new DoubleWritable(tuple.getDouble(i));
            }
            break;
        case BIGDECIMAL:
            if (tuple.getObject(i) != null) {
                converted = new HiveDecimalWritable(HiveDecimal.create((new BigDecimal(tuple.getString(i)))));
            }
            break;
        case STRING:
        default:
            if (tuple.getObject(i) != null) {
                converted = new Text(tuple.getString(i));
            }
        }
        struct.setFieldValue(i, converted);
    }
}