Example usage for org.apache.hadoop.io Text toString

Introduction

On this page you can find example usages of org.apache.hadoop.io.Text.toString(), collected from open-source projects.

Prototype

@Override
public String toString() 

Document

Converts the Text back to a String, decoding the stored UTF-8 bytes.
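
As a quick illustration, here is a minimal, self-contained sketch of the round trip (the class name TextToStringDemo and the sample string are illustrative only, not from any project below):

import org.apache.hadoop.io.Text;

public class TextToStringDemo {
    public static void main(String[] args) {
        // Text stores its content as UTF-8 bytes; toString() decodes them back.
        Text text = new Text("café");
        String decoded = text.toString();
        System.out.println(decoded); // prints: café
    }
}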

Usage

From source file: be.uantwerpen.adrem.eclat.EclatMinerReducer.java

License: Apache License

@Override
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    long numberOfSets = Long.parseLong(key.toString());
    for (Text item : values) {
        setsFound += numberOfSets;
        context.write(key, item);
    }
}
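
A side note on this pattern, which applies to any reducer like the one above (standard Hadoop behavior, not specific to this project): the framework reuses a single Text instance across the values iterator, so storing the Text objects themselves would leave you with many references to the last value, while toString() materializes an independent String that is safe to retain. A minimal sketch, with the hypothetical class name CopyingReducer:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class CopyingReducer extends Reducer<Text, Text, Text, Text> {
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        List<String> kept = new ArrayList<String>();
        for (Text item : values) {
            // Safe: toString() yields an independent, immutable copy.
            // Storing item itself would be unsafe: Hadoop rewrites this
            // same Text instance on each iteration.
            kept.add(item.toString());
        }
        for (String s : kept) {
            context.write(key, new Text(s));
        }
    }
}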

From source file: be.ugent.intec.halvade.hadoop.mapreduce.Bowtie2Mapper.java

License: Open Source License

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    super.map(key, value, context);
    ((Bowtie2Instance) instance).feedLine(value.toString(), (readcount % 2 + 1));
}

From source file: be.ugent.intec.halvade.hadoop.mapreduce.BWAAlnMapper.java

License: Open Source License

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    super.map(key, value, context);
    ((BWAAlnInstance) instance).feedLine(value.toString(), (readcount % 2 + 1));
}

From source file: be.ugent.intec.halvade.hadoop.mapreduce.BWAMemMapper.java

License: Open Source License

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    super.map(key, value, context);
    ((BWAMemInstance) instance).feedLine(value.toString());
}

From source file: be.ugent.intec.halvade.hadoop.mapreduce.Cushaw2Mapper.java

License: Open Source License

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    super.map(key, value, context);
    ((Cushaw2Instance) instance).feedLine(value.toString(), (readcount % 2 + 1));
}

From source file: be.ugent.intec.halvade.hadoop.mapreduce.HTSeqCombineMapper.java

@Override
protected void map(LongWritable key, Text value, Mapper.Context context)
        throws IOException, InterruptedException {
    String[] split = value.toString().split("\t");
    try {
        k.set(split[0] + "\t" + split[1] + "\t" + split[2] + "\t" + split[3] + "\t" + split[4]); // gene_id contig start end strand
        v.set(Integer.parseInt(split[split.length - 1]));
        context.write(k, v);
    } catch (ArrayIndexOutOfBoundsException | NumberFormatException ex) { // ignore header lines!
        Logger.DEBUG("invalid line ignored; " + value.toString());
    }
}

From source file: be.ugent.intec.halvade.hadoop.mapreduce.StarAlignPassXMapper.java

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    super.map(key, value, context);
    ((STARInstance) instance).feedLine(value.toString(), count, readcount % 2);
}

From source file: bixo.examples.crawl.MultiDomainUrlFilter.java

License: Apache License

public MultiDomainUrlFilter(Path filterFile) throws Exception {
    //we could require a filter file and put these in all urls or leave them here
    _suffixExclusionPattern = Pattern.compile("(?i)\\.(pdf|zip|gzip|gz|sit|bz|bz2|tar|tgz|exe)$");
    _protocolInclusionPattern = Pattern.compile("(?i)^(http|https)://");

    JobConf conf = HadoopUtils.getDefaultJobConf();
    try {//process the file passed in
        if (filterFile != null) {
            FileSystem fs = filterFile.getFileSystem(conf);
            if (fs.exists(filterFile)) {
                FSDataInputStream in = fs.open(filterFile);
                LineReader lr = new LineReader(in);
                Text tmpStr = new Text();
                while (lr.readLine(tmpStr) > 0 && !tmpStr.toString().isEmpty()) { // stop at the first blank line
                    String p = tmpStr.toString().trim(); // strip surrounding whitespace
                    if (p.startsWith("+")) { // '+' means do-crawl
                        ArrayList filterPair = new ArrayList();
                        filterPair.add(Boolean.TRUE);
                        filterPair.add(Pattern.compile(p.substring(1)));
                        _filters.add(filterPair);
                    } else if (p.startsWith("-")) { // '-' means filter out
                        ArrayList filterPair = new ArrayList();
                        filterPair.add(Boolean.FALSE);
                        filterPair.add(Pattern.compile(p.substring(1)));
                        _filters.add(filterPair);
                    } // otherwise a comment or malformed filter pattern
                }
            }
        }

    } catch (Exception e) {
        //any cleanup here? This would indicate a file system error, most likely
        throw e;
    }
}

From source file: bixo.examples.crawl.RegexUrlFilter.java

License: Apache License

public static List<String> getUrlFilterPatterns(String urlFiltersFile)
        throws IOException, InterruptedException {
    //this reads regex filters from a file in HDFS or the native file system
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    LOGGER.info("Looking for file: " + urlFiltersFile);
    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line)
                    && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
        in.close();
    } else {
        LOGGER.info("Can't find file: " + urlFiltersFile);
    }
    return filterList;
}

From source file: boa.datagen.SeqProjectCombiner.java

License: Apache License

public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    conf.set("fs.default.name", "hdfs://boa-njt/");
    FileSystem fileSystem = FileSystem.get(conf);
    String base = conf.get("fs.default.name", "");

    HashMap<String, String> sources = new HashMap<String, String>();
    HashSet<String> marks = new HashSet<String>();
    FileStatus[] files = fileSystem.listStatus(new Path(base + "tmprepcache/2015-07"));
    for (int i = 0; i < files.length; i++) {
        FileStatus file = files[i];
        String name = file.getPath().getName();
        if (name.startsWith("projects-") && name.endsWith(".seq")) {
            System.out.println("Reading file " + i + " in " + files.length + ": " + name);
            SequenceFile.Reader r = new SequenceFile.Reader(fileSystem, file.getPath(), conf);
            final Text key = new Text();
            final BytesWritable value = new BytesWritable();
            try {
                while (r.next(key, value)) {
                    String s = key.toString();
                    if (marks.contains(s))
                        continue;
                    Project p = Project
                            .parseFrom(CodedInputStream.newInstance(value.getBytes(), 0, value.getLength()));
                    if (p.getCodeRepositoriesCount() > 0 && p.getCodeRepositories(0).getRevisionsCount() > 0)
                        marks.add(s);
                    sources.put(s, name);
                }
            } catch (Exception e) {
                System.err.println(name);
                e.printStackTrace();
            }
            r.close();
        }
    }
    SequenceFile.Writer w = SequenceFile.createWriter(fileSystem, conf,
            new Path(base + "repcache/2015-07/projects.seq"), Text.class, BytesWritable.class);
    for (int i = 0; i < files.length; i++) {
        FileStatus file = files[i];
        String name = file.getPath().getName();
        if (name.startsWith("projects-") && name.endsWith(".seq")) {
            System.out.println("Reading file " + i + " in " + files.length + ": " + name);
            SequenceFile.Reader r = new SequenceFile.Reader(fileSystem, file.getPath(), conf);
            final Text key = new Text();
            final BytesWritable value = new BytesWritable();
            try {
                while (r.next(key, value)) {
                    String s = key.toString();
                    if (sources.get(s).equals(name))
                        w.append(key, value);
                }
            } catch (Exception e) {
                System.err.println(name);
                e.printStackTrace();
            }
            r.close();
        }
    }
    w.close();

    fileSystem.close();
}