Example usage for org.apache.hadoop.io Text toString

List of usage examples for org.apache.hadoop.io Text toString

Introduction

On this page you can find example usages of the toString method of org.apache.hadoop.io.Text.

Prototype

@Override
public String toString() 

Source Link

Document

Convert text back to string

Usage

From source file:com.alectenharmsel.research.SrcTokMapper.java

License:Apache License

/**
 * Splits the input text into runs of letters and writes each run to the context.
 *
 * <p>Every character for which {@link Character#isLetter(char)} is false is treated as a
 * separator. Empty tokens produced by consecutive separators are skipped.
 *
 * @param key      input key; not read by this method
 * @param contents the text to tokenize
 * @param context  receives one {@code (token, one)} pair per non-empty token; {@code one} is a
 *                 field declared outside this method
 * @throws IOException          if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
public void map(LongWritable key, Text contents, Mapper.Context context)
        throws IOException, InterruptedException {
    final String raw = contents.toString();
    final StringBuilder lettersOnly = new StringBuilder(raw.length());
    for (int pos = 0; pos < raw.length(); pos++) {
        final char ch = raw.charAt(pos);
        // Anything that is not a letter becomes a single blank so split(" ") can cut on it.
        lettersOnly.append(Character.isLetter(ch) ? ch : ' ');
    }

    for (String word : lettersOnly.toString().split(" ")) {
        if (word.isEmpty()) {
            continue;
        }
        context.write(new Text(word), one);
    }
}

From source file:com.alexholmes.hadooputils.sort.SortRecordReader.java

License:Apache License

/**
 * Extract the key from the sort line, using the spupplied options.
 *
 * @param value          the sort line/*from  w  ww  . j a v  a  2 s .co m*/
 * @param startKey       the start key, or null if there isn't one
 * @param endKey         the end key, or null if there isn't one
 * @param fieldSeparator the field separator, used if a start (and optionally end) key are set
 * @param ignoreCase     whether the result should be lower-cased to ensure case is ignored
 * @return the key
 * @throws IOException if something goes wrong
 */
protected static Text extractKey(final Text value, final Integer startKey, final Integer endKey,
        final String fieldSeparator, final boolean ignoreCase) throws IOException {

    Text result = new Text();

    if (startKey == null) {
        result.set(value);
    } else {

        // startKey is 1-based in the Linux sort, so decrement them to be 0-based
        //
        int startIdx = startKey - 1;

        String[] parts = StringUtils.split(value.toString(), fieldSeparator);

        if (startIdx >= parts.length) {
            throw new IOException("Start index is greater than parts in line");
        }

        int endIdx = parts.length;

        if (endKey != null) {
            // endKey is also 1-based in the Linux sort, but the StringUtils.join
            // end index is exclusive, so no need to decrement
            //
            endIdx = endKey;
            if (endIdx > parts.length) {
                throw new IOException("End index is greater than parts in line");
            }
        }

        result.set(StringUtils.join(parts, fieldSeparator, startIdx, endIdx));
    }

    if (ignoreCase) {
        result.set(result.toString().toLowerCase());
    }

    return result;
}

From source file:com.alimama.quanjingmonitor.kmeans.KMeansClusterCombiner.java

License:Apache License

/**
 * Keeps at most {@code limit} values for the key, namely those that order lowest under
 * {@code cmp}, and writes them to the context.
 *
 * <p>{@code limit} is derived from the cluster registered for the key in {@code clusterMap}
 * ({@code getNumselect() * rep * 10}, capped at 50000) and is never lower than 1000.
 *
 * @param key     cluster key; its string form is used for the {@code clusterMap} lookup
 * @param values  candidate records for the key
 * @param context receives the retained records under the same key, in the queue's iteration
 *                order (which is not sorted)
 * @throws IOException          if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    Cluster clu = clusterMap.get(key.toString());
    int limit = 1;
    if (clu != null) {
        limit = Math.min(clu.getNumselect() * this.rep * 10, 50000);
    }
    limit = Math.max(limit, 1000);

    // Reverse ordering puts the greatest retained element at the head, so it is the one
    // that gets compared against and evicted when a smaller value arrives.
    PriorityQueue<Text> res = new PriorityQueue<Text>(limit, Collections.reverseOrder(cmp));
    for (Text value : values) {
        // Copy once per value instead of decoding and re-allocating for every use. A copy is
        // kept because the queue must not hold on to the instance handed out by the iterator.
        Text copy = new Text(value.toString());

        if (res.size() < limit) {
            res.add(copy);
        } else if (cmp.compare(res.peek(), copy) > 0) {
            res.add(copy);
            res.poll();
        }
    }

    for (Text s : res) {
        context.write(key, s);
    }
}

From source file:com.alimama.quanjingmonitor.kmeans.KMeansClusterMapper.java

License:Apache License

/**
 * Assigns the input point to the closest cluster that does not deny it and writes the
 * assignment to the context.
 *
 * <p>The output key is the id of the chosen cluster; the output value is
 * {@code <distance>@abtest@<original line>}. Lines that cannot be parsed into a vector, and
 * points denied by every cluster, produce no output.
 *
 * @param key     input key; not read by this method
 * @param point   the input line describing one point
 * @param context receives the assignment; its {@code Clustering/similar} counter is
 *                incremented once per distance computation
 * @throws IOException          if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
@Override
protected void map(WritableComparable<?> key, Text point, Context context)
        throws IOException, InterruptedException {

    // Decode the line once; it is needed both for parsing and for the output value.
    String line = point.toString();
    Vector pointv = parse.parseVector(line);
    if (pointv == null) {
        return;
    }
    pointv.setNumPoints(1);

    Cluster nearestCluster = null;
    // Sentinel in the same type as the distance; it is always replaced by the first
    // accepted cluster because of the null check in the comparison below.
    double nearestDistance = Double.MAX_VALUE;
    for (Cluster cluster : clusters) {
        Vector clusterCenter = cluster.getCenter();
        boolean isDeny = clusterCenter.Deny(pointv);
        if (isDeny) {
            continue;
        }
        double distance = clusterCenter.distiance(pointv);
        context.getCounter("Clustering", "similar").increment(1);

        if (distance <= nearestDistance || nearestCluster == null) {
            nearestCluster = cluster;
            nearestDistance = distance;
        }
    }
    if (nearestCluster != null) {
        context.write(new Text(String.valueOf(nearestCluster.getId())),
                new Text(String.valueOf(nearestDistance) + "@abtest@" + line));
    }

}

From source file:com.alimama.quanjingmonitor.kmeans.KMeansClusterReduce.java

License:Apache License

/**
 * Reduces the candidate records of one cluster key into paired output records.
 *
 * <p>Steps, as implemented below:
 * <ol>
 *   <li>Look up the cluster for the key in {@code clusterMap} to obtain the number of pairs to
 *       build ({@code numberSelect}) and the cap on retained candidates ({@code limit}).</li>
 *   <li>Collect at most {@code limit} values in a priority queue, evicting the queue head (the
 *       greatest element under {@code cmp}) whenever a smaller value arrives.</li>
 *   <li>Walk the candidates in {@code cmp} order and place them into {@code writelist} slots,
 *       either as first member ({@code s1}/{@code v1}) or second member
 *       ({@code s2}/{@code v2}).</li>
 *   <li>Make a second pass over {@code left} to fill slots that still lack a second member.</li>
 *   <li>Write two records for every slot that has both members.</li>
 * </ol>
 *
 * @param key     cluster key; its string form is used for the {@code clusterMap} lookup
 * @param values  candidate records; each is split on {@code "@abtest@"} and its second field
 *                is parsed as a vector
 * @param context receives the paired records under the same key
 * @throws IOException          if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    Cluster clu = clusterMap.get(key.toString());
    // Defaults used when no cluster is registered for the key.
    int numberSelect = 1;
    int limit = 1;
    if (clu != null) {
        limit = Math.min(clu.getNumselect() * this.rep * 100, 100000);
        numberSelect = clu.getNumselect();
        System.out.println("key:" + key + "," + numberSelect);

    } else {
        System.out.println("can nott found key:" + key);
    }

    // Never retain fewer than 5000 candidates.
    if (limit < 5000) {
        limit = 5000;
    }

    // Reverse ordering keeps the greatest retained element (under cmp) at the head, so the
    // head is what gets compared against and evicted once the queue is full.
    PriorityQueue<String> res = new PriorityQueue<String>(limit, Collections.reverseOrder(cmp));
    for (Text value : values) {

        if (res.size() < limit) {
            res.add(value.toString());
        } else if (cmp.compare(res.peek(), value.toString()) > 0) {
            res.add(value.toString());
            res.poll();
        }
    }

    // Candidates in ascending cmp order.
    ArrayList<String> list = new ArrayList<String>(res);
    Collections.sort(list, cmp);

    // One slot per pair to build; a slot is created lazily by the first candidate reaching it.
    comPair[] writelist = new comPair[numberSelect];
    int end = list.size();

    // NOTE(review): left starts as a copy of ALL retained candidates, not as an empty list,
    // so the second pass below also revisits candidates that were already placed in the first
    // pass (and may place them again) -- confirm this is intended.
    ArrayList<String> left = new ArrayList<String>(res);

    // First pass: try to place every candidate into a slot.
    for (int i = 0; i < end; i++) {
        String s = list.get(i);
        System.out.println("111>>" + s);
        // NOTE(review): cols[1] throws ArrayIndexOutOfBoundsException when the record does not
        // contain "@abtest@" -- presumably the upstream mapper always emits it; verify.
        String[] cols = s.split("@abtest@");
        String line = cols[1];
        Vector group = parse.parseVector(line);
        for (int j = 0; j < writelist.length; j++) {
            if (writelist[j] == null) {
                // Empty slot: the candidate becomes its first member and is consumed.
                comPair p = new comPair();
                p.s1 = s;
                p.v1 = group;
                writelist[j] = p;
                s = null;
                break;
            }

            boolean deny = writelist[j].v1.Deny(group);
            double dis = writelist[j].v1.distiance(group);
            System.out.println("222>>" + dis);

            // The candidate takes over as second member when it is not denied and is closer to
            // the first member than the slot's recorded distance.
            // NOTE(review): the initial value of comPair.distance is defined outside this
            // block -- presumably a large sentinel; confirm.
            if (!deny && writelist[j].distance > dis) {
                writelist[j].distance = dis;
                String s_tmp = writelist[j].s2;
                Vector group_tmp = writelist[j].v2;
                writelist[j].s2 = s;
                writelist[j].v2 = group;
                // Carry the displaced second member (null if there was none) on to the
                // remaining slots.
                s = s_tmp;
                group = group_tmp;
                if (s_tmp == null) {
                    break;
                }

            }
        }

        // Whatever is still being carried after the last slot could not be placed.
        if (s != null) {
            left.add(s);
        }
    }

    int end2 = left.size();

    // Second pass: offer the entries of left to slots that have a first but no second member.
    for (int i = 0; i < end2; i++) {
        String s = left.get(i);
        String[] cols = s.split("@abtest@");
        String line = cols[1];
        Vector group = parse.parseVector(line);
        boolean isset = false;
        for (int j = 0; j < writelist.length; j++) {
            // Skip slots that were never created and slots that are already complete.
            if (writelist[j] == null || writelist[j].s2 != null) {
                continue;
            }

            // Unlike the first pass, Deny is not consulted here.
            double dis = writelist[j].v1.distiance(group);
            if (writelist[j].distance > dis) {
                System.out.println("333>>" + s);
                isset = true;
                writelist[j].distance = dis;
                // s2 is null for every slot reaching this point (see the skip above), so s_tmp
                // is null and the break below is always taken.
                String s_tmp = writelist[j].s2;
                Vector group_tmp = writelist[j].v2;
                writelist[j].s2 = s;
                writelist[j].v2 = group;
                if (s_tmp == null) {
                    break;
                }
                s = s_tmp;
                group = group_tmp;
            }
        }

        // Stop at the first entry that could not be placed into any slot.
        if (!isset) {
            break;
        }
    }

    // Emit both members of every complete pair.
    for (int i = 0; i < writelist.length; i++) {
        if (writelist[i] != null && writelist[i].s2 != null) {
            // rrr is randomly 0 or 1 and rrr2 is the other value, so the two members of a pair
            // always carry opposite "rep" labels.
            int rrr = (int) ((Math.random() * 10000) % 2);
            int rrr2 = (rrr + 1) % 2;
            System.out.println(writelist[i].toString());
            context.write(key,
                    new Text(writelist[i].distance + "\t" + i + "\trep" + rrr + "_1\t" + writelist[i].s1));
            context.write(key,
                    new Text(writelist[i].distance + "\t" + i + "\trep" + rrr2 + "_2\t" + writelist[i].s2));
        }
    }

}

From source file:com.alimama.quanjingmonitor.kmeans.KMeansGroupMapper.java

License:Apache License

/**
 * Wraps the input point into a single-point cluster and writes it under the key parsed from
 * the same line.
 *
 * <p>Lines that cannot be parsed into a vector produce no output.
 *
 * @param key     input key; not read by this method
 * @param point   the input line describing one point
 * @param context receives {@code (parse.parseKey(line), cluster)}
 * @throws IOException          if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
@Override
protected void map(WritableComparable<?> key, Text point, Context context)
        throws IOException, InterruptedException {

    String line = point.toString();
    Vector pointv = parse.parseVector(line);
    if (pointv == null) {
        // Other callers of parseVector in this package guard against a null result; without
        // this check an unparsable line would fail the task with a NullPointerException.
        return;
    }
    pointv.setNumPoints(1);

    Cluster clu = new Cluster(pointv, 0);
    context.write(new Text(String.valueOf(parse.parseKey(line))), clu);

}

From source file:com.alimama.quanjingmonitor.kmeans.KMeansMapper.java

License:Apache License

/**
 * Finds the closest cluster whose center the point does not deny and writes the parsed point
 * under that cluster's id.
 *
 * <p>Nothing is written when the line cannot be parsed or when the point denies every
 * cluster center. On equal distances the cluster seen later wins.
 *
 * @param key     input key; not read by this method
 * @param point   the input line describing one point
 * @param context receives {@code (cluster id, parsed point)}; its {@code Clustering/similar}
 *                counter is incremented once per distance computation
 * @throws IOException          if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
@Override
protected void map(WritableComparable<?> key, Text point, Context context)
        throws IOException, InterruptedException {

    final Vector parsed = parse.parseVector(point.toString());
    if (parsed == null) {
        return;
    }
    parsed.setNumPoints(1);

    Cluster best = null;
    double bestDistance = Double.MAX_VALUE;
    for (Cluster candidate : clusters) {
        final Vector center = candidate.getCenter();
        if (!parsed.Deny(center)) {
            final double candidateDistance = center.distiance(parsed);
            context.getCounter("Clustering", "similar").increment(1);

            // The first accepted candidate always wins; later ones must be at least as close.
            if (best == null || candidateDistance <= bestDistance) {
                best = candidate;
                bestDistance = candidateDistance;
            }
        }
    }

    if (best == null) {
        return;
    }
    context.write(new Text(String.valueOf(best.getId())), parsed);

}

From source file:com.asakusafw.dag.compiler.internalio.InternalOutputAdapterGeneratorTest.java

License:Apache License

/**
 * Runs a generated output adapter over the given values and asserts that the same values can
 * be read back from temporary storage.
 *
 * <p>The adapter is generated for a single output named {@code "o"} that targets
 * {@code part-*} under the temporary folder. Read-back results are collected into a set, so
 * the comparison ignores order.
 *
 * @param values the records to push through the adapter and expect back
 */
private void check(String... values) {
    Path outputPattern = new Path(new File(temporary.getRoot(), "part-*").toURI());
    Configuration hadoopConf = configurations.newInstance();

    ClassGeneratorContext generatorContext = context();
    Spec outputSpec = new Spec("o", outputPattern.toString(), Descriptions.typeOf(Text.class));
    ClassDescription generated = add(
            c -> new InternalOutputAdapterGenerator().generate(generatorContext, Arrays.asList(outputSpec), c));

    // Write phase: feed every value into the sink of output "o" within one task session.
    loading(generated, c -> {
        VertexProcessorContext vertexContext = new MockVertexProcessorContext().with(c).withResource(hadoopConf)
                .withResource(new StageInfo("u", "b", "f", "s", "e", Collections.emptyMap()));
        try (OutputAdapter outputAdapter = adapter(c, vertexContext)) {
            outputAdapter.initialize();
            OutputHandler<? super TaskProcessorContext> outputHandler = outputAdapter.newHandler();
            Result<Text> target = outputHandler.getSink(Text.class, "o");
            try (Session taskSession = outputHandler.start(new MockTaskProcessorContext("t"))) {
                for (String item : values) {
                    target.add(new Text(item));
                }
            }
        }
    });

    // Read phase: collect the string form of every record found under the output pattern.
    Set<String> actual = new LinkedHashSet<>();
    try {
        List<Path> written = TemporaryStorage.list(hadoopConf, outputPattern);
        Text record = new Text();
        for (Path part : written) {
            try (ModelInput<Text> reader = TemporaryStorage.openInput(hadoopConf, Text.class, part)) {
                while (reader.readTo(record)) {
                    actual.add(record.toString());
                }
            }
        }
    } catch (IOException e) {
        throw new AssertionError(e);
    }
    assertThat(actual, containsInAnyOrder(values));
}

From source file:com.asakusafw.dag.compiler.internalio.InternalOutputPrepareGeneratorTest.java

License:Apache License

/**
 * Reads every record from the {@code part-*} files directly inside the given folder.
 *
 * @param folder the directory to scan for files whose name starts with {@code "part-"}
 * @return the string form of every record, in the order the files were listed and read
 * @throws IOException if the folder cannot be listed or a file cannot be read
 */
private static List<String> collect(File folder) throws IOException {
    // listFiles returns null (rather than an empty array) when the path is not a directory or
    // cannot be read; fail with the offending path instead of a bare NullPointerException.
    File[] parts = folder.listFiles(f -> f.getName().startsWith("part-"));
    if (parts == null) {
        throw new IOException("Cannot list files in " + folder);
    }

    List<String> results = new ArrayList<>();
    for (File file : parts) {
        try (ModelInput<Text> in = LocalInternalInputTaskInfo.open(file)) {
            // A single buffer is reused; toString() snapshots each record before the next read.
            Text buf = new Text();
            while (in.readTo(buf)) {
                results.add(buf.toString());
            }
        }
    }
    return results;
}

From source file:com.asakusafw.dag.runtime.internalio.HadoopInternalInputTaskInfoTest.java

License:Apache License

/**
 * Simple case: a task info opened on a file prepared with one record yields exactly that
 * record.
 * @throws Exception if failed
 */
@Test
public void simple() throws Exception {
    Configuration hadoopConf = new Configuration();
    Path target = new Path(temporary.newFile().toURI());
    FileSystem fs = target.getFileSystem(hadoopConf);
    // Prepare the file through the test's put helper.
    put(fs, target, "Hello, world!");

    HadoopInternalInputTaskInfo<Text> taskInfo = new HadoopInternalInputTaskInfo<>(fs, target, 0, 0,
            Text::new);

    List<String> actual = new ArrayList<>();
    try (ModelInput<Text> reader = taskInfo.open()) {
        Text record = taskInfo.newDataObject();
        while (reader.readTo(record)) {
            actual.add(record.toString());
        }
    }
    assertThat(actual, containsInAnyOrder("Hello, world!"));
}