List of usage examples for org.apache.hadoop.io.Text#toString()
@Override
public String toString()
From source file:com.alectenharmsel.research.SrcTokMapper.java
License:Apache License
/**
 * Tokenizes one input record into runs of letters and emits each token.
 *
 * <p>Every character for which {@link Character#isLetter(char)} is false is
 * turned into a single space; the result is split on single spaces and each
 * non-empty piece is written as {@code (token, one)}.
 *
 * @param key the input key (not used)
 * @param contents the record text to tokenize
 * @param context the context receiving the emitted pairs
 * @throws IOException if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
public void map(LongWritable key, Text contents, Mapper.Context context)
        throws IOException, InterruptedException {
    char[] chars = contents.toString().toCharArray();
    for (int idx = 0; idx < chars.length; idx++) {
        if (!Character.isLetter(chars[idx])) {
            chars[idx] = ' ';
        }
    }

    for (String token : new String(chars).split(" ")) {
        // Consecutive separators produce empty pieces; those are not emitted.
        if (token.length() == 0) {
            continue;
        }
        context.write(new Text(token), one);
    }
}
From source file:com.alexholmes.hadooputils.sort.SortRecordReader.java
License:Apache License
/** * Extract the key from the sort line, using the spupplied options. * * @param value the sort line/*from w ww . j a v a 2 s .co m*/ * @param startKey the start key, or null if there isn't one * @param endKey the end key, or null if there isn't one * @param fieldSeparator the field separator, used if a start (and optionally end) key are set * @param ignoreCase whether the result should be lower-cased to ensure case is ignored * @return the key * @throws IOException if something goes wrong */ protected static Text extractKey(final Text value, final Integer startKey, final Integer endKey, final String fieldSeparator, final boolean ignoreCase) throws IOException { Text result = new Text(); if (startKey == null) { result.set(value); } else { // startKey is 1-based in the Linux sort, so decrement them to be 0-based // int startIdx = startKey - 1; String[] parts = StringUtils.split(value.toString(), fieldSeparator); if (startIdx >= parts.length) { throw new IOException("Start index is greater than parts in line"); } int endIdx = parts.length; if (endKey != null) { // endKey is also 1-based in the Linux sort, but the StringUtils.join // end index is exclusive, so no need to decrement // endIdx = endKey; if (endIdx > parts.length) { throw new IOException("End index is greater than parts in line"); } } result.set(StringUtils.join(parts, fieldSeparator, startIdx, endIdx)); } if (ignoreCase) { result.set(result.toString().toLowerCase()); } return result; }
From source file:com.alimama.quanjingmonitor.kmeans.KMeansClusterCombiner.java
License:Apache License
/**
 * Forwards a bounded number of the values seen for a cluster key.
 *
 * <p>The bound is {@code min(numselect * rep * 10, 50000)} for a key found in
 * {@code clusterMap}, and never less than 1000. Values are held in a priority
 * queue ordered by the reverse of {@code cmp}; once the queue is full, a new
 * value is admitted only when {@code cmp} orders it before the current head,
 * which is then evicted. The surviving values are written in the queue's
 * iteration order (not sorted).
 *
 * @param key the cluster key
 * @param values the values grouped under the key
 * @param context the context receiving the retained values
 * @throws IOException if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    Cluster clu = clusterMap.get(key.toString());
    int limit = 1;
    if (clu != null) {
        limit = Math.min(clu.getNumselect() * this.rep * 10, 50000);
    }
    if (limit < 1000) {
        limit = 1000;
    }

    PriorityQueue<Text> res = new PriorityQueue<Text>(limit, Collections.reverseOrder(cmp));
    for (Text value : values) {
        // Copy once per value and reuse the copy for both the comparison and the
        // insertion (the original decoded and allocated it up to twice).
        // The copy itself is kept: presumably the framework reuses the iterated
        // Text instance - TODO confirm.
        Text copy = new Text(value.toString());
        if (res.size() < limit) {
            res.add(copy);
        } else if (cmp.compare(res.peek(), copy) > 0) {
            res.add(copy);
            res.poll();
        }
    }

    for (Text s : res) {
        context.write(key, s);
    }
}
From source file:com.alimama.quanjingmonitor.kmeans.KMeansClusterMapper.java
License:Apache License
/**
 * Assigns an input point to its nearest non-denying cluster.
 *
 * <p>The line is parsed into a vector; lines that parse to {@code null} are
 * skipped. Every cluster whose center does not {@code Deny} the point is a
 * candidate, and the candidate with the smallest distance wins (ties go to the
 * later cluster). The output is keyed by the cluster id and carries
 * {@code <distance>@abtest@<original line>}.
 *
 * @param key the input key (not used)
 * @param point the raw input line
 * @param context the context receiving the assignment and the counter updates
 * @throws IOException if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
@Override
protected void map(WritableComparable<?> key, Text point, Context context)
        throws IOException, InterruptedException {
    // Decode the line once; it is needed for parsing and again for the output.
    String line = point.toString();
    Vector pointv = parse.parseVector(line);
    if (pointv == null) {
        return;
    }
    pointv.setNumPoints(1);

    Cluster nearestCluster = null;
    // A double sentinel for a double distance. The first candidate is always
    // accepted through the null check, so the sentinel never decides a result.
    double nearestDistance = Double.MAX_VALUE;
    for (Cluster cluster : clusters) {
        Vector clusterCenter = cluster.getCenter();
        if (clusterCenter.Deny(pointv)) {
            continue;
        }
        double distance = clusterCenter.distiance(pointv);
        context.getCounter("Clustering", "similar").increment(1);
        if (distance <= nearestDistance || nearestCluster == null) {
            nearestCluster = cluster;
            nearestDistance = distance;
        }
    }

    if (nearestCluster != null) {
        context.write(new Text(String.valueOf(nearestCluster.getId())),
                new Text(String.valueOf(nearestDistance) + "@abtest@" + line));
    }
}
From source file:com.alimama.quanjingmonitor.kmeans.KMeansClusterReduce.java
License:Apache License
/**
 * Pairs up the values of one cluster key and writes each completed pair as two
 * records.
 *
 * <p>Outline of what the code does:
 * <ol>
 * <li>Retains at most {@code limit} values in a priority queue ordered by the
 *     reverse of {@code cmp}; {@code limit} is
 *     {@code min(numselect * rep * 100, 100000)} for a known cluster and never
 *     less than 5000.</li>
 * <li>Sorts the retained values with {@code cmp} and walks them through
 *     {@code numberSelect} slots ({@code comPair}): an empty slot takes the value
 *     as its first member ({@code s1}/{@code v1}); an occupied slot takes it as
 *     second member ({@code s2}/{@code v2}) when not denied and closer than the
 *     slot's current {@code distance}, passing any displaced second member on to
 *     the following slots.</li>
 * <li>Makes a second pass over {@code left} to fill slots that still lack a
 *     second member.</li>
 * <li>Writes, for every slot with both members, two tab-separated records tagged
 *     {@code rep0}/{@code rep1} in random order.</li>
 * </ol>
 *
 * <p>Each value is split on {@code "@abtest@"} and the second field is parsed as
 * the vector, so a value without that separator fails on {@code cols[1]}.
 *
 * @param key the cluster key; looked up in {@code clusterMap} by its string form
 * @param values the values grouped under the key
 * @param context the context receiving the paired records
 * @throws IOException if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    Cluster clu = clusterMap.get(key.toString());
    int numberSelect = 1;
    int limit = 1;
    if (clu != null) {
        limit = Math.min(clu.getNumselect() * this.rep * 100, 100000);
        numberSelect = clu.getNumselect();
        System.out.println("key:" + key + "," + numberSelect);
    } else {
        System.out.println("can nott found key:" + key);
    }
    // Lower bound on the number of retained values, also used for unknown keys.
    if (limit < 5000) {
        limit = 5000;
    }
    // Head of the queue is the element cmp orders last, so it is the one evicted
    // when a value that cmp orders earlier arrives.
    PriorityQueue<String> res = new PriorityQueue<String>(limit, Collections.reverseOrder(cmp));
    for (Text value : values) {
        if (res.size() < limit) {
            res.add(value.toString());
        } else if (cmp.compare(res.peek(), value.toString()) > 0) {
            res.add(value.toString());
            res.poll();
        }
    }
    ArrayList<String> list = new ArrayList<String>(res);
    Collections.sort(list, cmp);
    // One slot per pair to produce.
    comPair[] writelist = new comPair[numberSelect];
    int end = list.size();
    // NOTE(review): 'left' starts as a copy of ALL retained values, and the first
    // pass appends unplaced values on top of that - confirm this is intended
    // rather than starting from an empty list.
    ArrayList<String> left = new ArrayList<String>(res);
    // First pass: place each value, in cmp order, into the slots.
    for (int i = 0; i < end; i++) {
        String s = list.get(i);
        System.out.println("111>>" + s);
        String[] cols = s.split("@abtest@");
        String line = cols[1];
        Vector group = parse.parseVector(line);
        for (int j = 0; j < writelist.length; j++) {
            if (writelist[j] == null) {
                // Empty slot: the value becomes the slot's first member.
                comPair p = new comPair();
                p.s1 = s;
                p.v1 = group;
                writelist[j] = p;
                s = null;
                break;
            }
            boolean deny = writelist[j].v1.Deny(group);
            double dis = writelist[j].v1.distiance(group);
            System.out.println("222>>" + dis);
            // The initial value of comPair.distance is not visible here; presumably
            // it starts large enough for the first candidate to win - TODO confirm.
            if (!deny && writelist[j].distance > dis) {
                // Take over as second member; the displaced one (if any) keeps
                // travelling through the remaining slots.
                writelist[j].distance = dis;
                String s_tmp = writelist[j].s2;
                Vector group_tmp = writelist[j].v2;
                writelist[j].s2 = s;
                writelist[j].v2 = group;
                s = s_tmp;
                group = group_tmp;
                if (s_tmp == null) {
                    break;
                }
            }
        }
        // Whatever is still in hand (the value itself or a displaced member)
        // found no slot.
        if (s != null) {
            left.add(s);
        }
    }
    int end2 = left.size();
    // Second pass: try to complete slots that have a first member but no second.
    for (int i = 0; i < end2; i++) {
        String s = left.get(i);
        String[] cols = s.split("@abtest@");
        String line = cols[1];
        Vector group = parse.parseVector(line);
        boolean isset = false;
        for (int j = 0; j < writelist.length; j++) {
            // Only slots that exist and still lack a second member are considered.
            if (writelist[j] == null || writelist[j].s2 != null) {
                continue;
            }
            double dis = writelist[j].v1.distiance(group);
            if (writelist[j].distance > dis) {
                System.out.println("333>>" + s);
                isset = true;
                writelist[j].distance = dis;
                // s2 is null here (checked above), so s_tmp is null and the
                // break below is always taken after a placement.
                String s_tmp = writelist[j].s2;
                Vector group_tmp = writelist[j].v2;
                writelist[j].s2 = s;
                writelist[j].v2 = group;
                if (s_tmp == null) {
                    break;
                }
                s = s_tmp;
                group = group_tmp;
            }
        }
        // The whole second pass stops at the first value that fits nowhere.
        if (!isset) {
            break;
        }
    }
    for (int i = 0; i < writelist.length; i++) {
        if (writelist[i] != null && writelist[i].s2 != null) {
            // rrr is 0 or 1 at random and rrr2 is the other one, so the two
            // members get opposite "rep" tags.
            int rrr = (int) ((Math.random() * 10000) % 2);
            int rrr2 = (rrr + 1) % 2;
            System.out.println(writelist[i].toString());
            context.write(key, new Text(writelist[i].distance + "\t" + i + "\trep" + rrr + "_1\t" + writelist[i].s1));
            context.write(key, new Text(writelist[i].distance + "\t" + i + "\trep" + rrr2 + "_2\t" + writelist[i].s2));
        }
    }
}
From source file:com.alimama.quanjingmonitor.kmeans.KMeansGroupMapper.java
License:Apache License
@Override protected void map(WritableComparable<?> key, Text point, Context context) throws IOException, InterruptedException { String line = point.toString(); Vector pointv = parse.parseVector(line); pointv.setNumPoints(1);//from w w w . j a v a 2 s.c om Cluster clu = new Cluster(pointv, 0); context.write(new Text(String.valueOf(parse.parseKey(line))), clu); }
From source file:com.alimama.quanjingmonitor.kmeans.KMeansMapper.java
License:Apache License
@Override protected void map(WritableComparable<?> key, Text point, Context context) throws IOException, InterruptedException { Cluster nearestCluster = null;//from ww w . j a va2s . com double nearestDistance = Double.MAX_VALUE; Vector pointv = parse.parseVector(point.toString()); if (pointv == null) { return; } pointv.setNumPoints(1); for (Cluster cluster : clusters) { Vector clusterCenter = cluster.getCenter(); boolean isDeny = pointv.Deny(clusterCenter); if (isDeny) { continue; } double distance = clusterCenter.distiance(pointv); context.getCounter("Clustering", "similar").increment(1); if (distance <= nearestDistance || nearestCluster == null) { nearestCluster = cluster; nearestDistance = distance; } } if (nearestCluster != null) { context.write(new Text(String.valueOf(nearestCluster.getId())), pointv); } }
From source file:com.asakusafw.dag.compiler.internalio.InternalOutputAdapterGeneratorTest.java
License:Apache License
/**
 * Generates an output adapter for a single {@code Text} output named "o", pushes
 * the given values through it, then reads everything back from the temporary
 * storage and asserts that the distinct values read match {@code values} in any
 * order.
 *
 * @param values the values to write and expect back
 */
private void check(String... values) {
    Path pattern = new Path(new File(temporary.getRoot(), "part-*").toURI());
    Configuration conf = configurations.newInstance();
    ClassGeneratorContext generatorContext = context();
    Spec outputSpec = new Spec("o", pattern.toString(), Descriptions.typeOf(Text.class));
    ClassDescription generated = add(target ->
            new InternalOutputAdapterGenerator().generate(generatorContext, Arrays.asList(outputSpec), target));

    // Write phase: run the generated adapter and feed it every value.
    loading(generated, loaded -> {
        VertexProcessorContext vertexContext = new MockVertexProcessorContext().with(loaded).withResource(conf)
                .withResource(new StageInfo("u", "b", "f", "s", "e", Collections.emptyMap()));
        try (OutputAdapter adapter = adapter(loaded, vertexContext)) {
            adapter.initialize();
            OutputHandler<? super TaskProcessorContext> handler = adapter.newHandler();
            Result<Text> sink = handler.getSink(Text.class, "o");
            try (Session session = handler.start(new MockTaskProcessorContext("t"))) {
                for (String element : values) {
                    sink.add(new Text(element));
                }
            }
        }
    });

    // Read phase: collect the contents of every file matching the pattern.
    Set<String> actual = new LinkedHashSet<>();
    try {
        List<Path> written = TemporaryStorage.list(conf, pattern);
        Text buffer = new Text();
        for (Path part : written) {
            try (ModelInput<Text> input = TemporaryStorage.openInput(conf, Text.class, part)) {
                while (input.readTo(buffer)) {
                    actual.add(buffer.toString());
                }
            }
        }
    } catch (IOException e) {
        throw new AssertionError(e);
    }

    assertThat(actual, containsInAnyOrder(values));
}
From source file:com.asakusafw.dag.compiler.internalio.InternalOutputPrepareGeneratorTest.java
License:Apache License
/**
 * Reads every record from the {@code part-*} files directly inside a folder.
 *
 * @param folder the folder whose files with names starting with "part-" are read
 * @return the string form of each record, in the order the files were listed and read
 * @throws IOException if the folder cannot be listed (for example, it is not a
 *                     directory) or a file cannot be read
 */
private static List<String> collect(File folder) throws IOException {
    File[] parts = folder.listFiles(f -> f.getName().startsWith("part-"));
    if (parts == null) {
        // listFiles returns null, not an empty array, when the path is not a
        // directory or an I/O error occurs; report that instead of a bare NPE.
        throw new IOException("failed to list files in: " + folder);
    }

    List<String> results = new ArrayList<>();
    for (File file : parts) {
        try (ModelInput<Text> in = LocalInternalInputTaskInfo.open(file)) {
            Text buf = new Text();
            while (in.readTo(buf)) {
                results.add(buf.toString());
            }
        }
    }
    return results;
}
From source file:com.asakusafw.dag.runtime.internalio.HadoopInternalInputTaskInfoTest.java
License:Apache License
/**
 * simple case: a file holding one record is read back as exactly that record.
 * @throws Exception if failed
 */
@Test
public void simple() throws Exception {
    Configuration conf = new Configuration();
    Path target = new Path(temporary.newFile().toURI());
    FileSystem fs = target.getFileSystem(conf);
    put(fs, target, "Hello, world!");

    HadoopInternalInputTaskInfo<Text> info =
            new HadoopInternalInputTaskInfo<>(fs, target, 0, 0, Text::new);
    List<String> lines = new ArrayList<>();
    try (ModelInput<Text> input = info.open()) {
        for (Text buffer = info.newDataObject(); input.readTo(buffer);) {
            lines.add(buffer.toString());
        }
    }

    assertThat(lines, containsInAnyOrder("Hello, world!"));
}