Example usage for org.apache.hadoop.io Text Text

Introduction

On this page you can find example usages of the org.apache.hadoop.io.Text(byte[]) constructor.

Prototype

public Text(byte[] utf8) 

Document

Construct from a byte array.
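As a quick illustration (a minimal sketch, not taken from the source files below), the constructor copies the given byte array and interprets it as UTF-8:

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.Text;

public class TextFromBytesExample {
    public static void main(String[] args) {
        // Text(byte[]) interprets the byte array as UTF-8 encoded text.
        byte[] utf8 = "hello hadoop".getBytes(StandardCharsets.UTF_8);
        Text text = new Text(utf8);

        System.out.println(text.toString());  // hello hadoop
        System.out.println(text.getLength()); // 12 (number of UTF-8 bytes)
    }
}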

Usage

From source file: cn.edu.hfut.dmic.webcollector.fetcher.FetcherOutputFormat.java

@Override
public RecordWriter<Text, Writable> getRecordWriter(TaskAttemptContext tac)
        throws IOException, InterruptedException {
    Configuration conf = tac.getConfiguration();
    FileSystem fs = FileSystem.get(conf);
    String outputPath = conf.get("mapred.output.dir");

    Path fetchPath = new Path(outputPath, "fetch/info");
    Path contentPath = new Path(outputPath, "content/info");
    Path parseDataPath = new Path(outputPath, "parse/info");
    Path redirectPath = new Path(outputPath, "redirect/info");
    final SequenceFile.Writer fetchOut = new SequenceFile.Writer(fs, conf, fetchPath, Text.class,
            CrawlDatum.class);
    final SequenceFile.Writer contentOut = new SequenceFile.Writer(fs, conf, contentPath, Text.class,
            Content.class);
    final SequenceFile.Writer parseDataOut = new SequenceFile.Writer(fs, conf, parseDataPath, Text.class,
            CrawlDatum.class);
    final SequenceFile.Writer redirectOut = new SequenceFile.Writer(fs, conf, redirectPath, CrawlDatum.class,
            Text.class);

    return new RecordWriter<Text, Writable>() {

        @Override
        public void write(Text k, Writable v) throws IOException, InterruptedException {
            if (v instanceof CrawlDatum) {
                fetchOut.append(k, v);
            } else if (v instanceof Content) {
                contentOut.append(k, v);
            } else if (v instanceof ParseData) {

                ParseData parseData = (ParseData) v;
                CrawlDatums next = parseData.next;
                for (CrawlDatum datum : next) {
                    parseDataOut.append(new Text(datum.getKey()), datum);
                }

            } else if (v instanceof Redirect) {
                Redirect redirect = (Redirect) v;
                redirectOut.append(redirect.datum, new Text(redirect.realUrl));
            }
        }

        @Override
        public void close(TaskAttemptContext tac) throws IOException, InterruptedException {
            fetchOut.close();
            contentOut.close();
            parseDataOut.close();
            redirectOut.close();
        }
    };

}

From source file: cn.edu.hfut.dmic.webcollectorcluster.fetcher.FetcherOutputFormat.java

@Override
public org.apache.hadoop.mapred.RecordWriter<Text, WebWritable> getRecordWriter(FileSystem fs, JobConf jc,
        String string, Progressable p) throws IOException {
    Configuration conf = jc;
    String outputPath = conf.get("mapred.output.dir");
    Path fetchPath = new Path(outputPath, "fetch/info");
    Path contentPath = new Path(outputPath, "content/info");
    Path parseDataPath = new Path(outputPath, "parse_data/info");
    Path parseTempPath = new Path(outputPath, "parse_temp/info");
    final SequenceFile.Writer fetchOut = new SequenceFile.Writer(fs, conf, fetchPath, Text.class,
            CrawlDatum.class);
    final SequenceFile.Writer contentOut = new SequenceFile.Writer(fs, conf, contentPath, Text.class,
            Content.class);
    final SequenceFile.Writer parseDataOut = new SequenceFile.Writer(fs, conf, parseDataPath, Text.class,
            ParseData.class);
    final SequenceFile.Writer parseTempOut = new SequenceFile.Writer(fs, conf, parseTempPath, Text.class,
            CrawlDatum.class);
    return new RecordWriter<Text, WebWritable>() {
        @Override
        public void write(Text key, WebWritable value) throws IOException {
            Writable w = value.get();
            if (w instanceof CrawlDatum) {
                fetchOut.append(key, w);
            } else if (w instanceof Content) {
                contentOut.append(key, w);
            } else if (w instanceof ParseData) {
                parseDataOut.append(key, w);
                ParseData parseData = (ParseData) w;
                if (parseData.getLinks() != null) {
                    for (Link link : parseData.getLinks()) {
                        CrawlDatum datum = new CrawlDatum();
                        datum.setUrl(link.getUrl());
                        datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
                        datum.setFetchTime(CrawlDatum.FETCHTIME_UNDEFINED);
                        parseTempOut.append(new Text(datum.getUrl()), datum);
                    }
                }
            }
        }

        @Override
        public void close(Reporter rprtr) throws IOException {
            fetchOut.close();
            contentOut.close();
            parseDataOut.close();
            parseTempOut.close();
        }
    };
}

From source file: cn.edu.hfut.dmic.webcollectorcluster.generator.Injector.java

public void inject(Path crawlDir, ArrayList<String> urls)
        throws IOException, InterruptedException, ClassNotFoundException, Exception {
    Path crawldb = new Path(crawlDir, "crawldb");
    Configuration config = CrawlerConfiguration.create();
    System.out.println(config.get("mapred.jar"));
    FileSystem fs = crawldb.getFileSystem(config);
    Path tempdb = new Path(crawldb, "temp");
    if (fs.exists(tempdb)) {
        fs.delete(tempdb);
    }

    SequenceFile.Writer writer = new SequenceFile.Writer(fs, config, new Path(tempdb, "info.avro"), Text.class,
            CrawlDatum.class);
    for (String url : urls) {
        CrawlDatum crawldatum = new CrawlDatum();
        crawldatum.setUrl(url);
        crawldatum.setStatus(CrawlDatum.STATUS_DB_INJECTED);
        writer.append(new Text(url), crawldatum);
        System.out.println("inject:" + url);
    }
    writer.close();

    String[] args = new String[] { crawldb.toString(), tempdb.toString() };

    ToolRunner.run(CrawlerConfiguration.create(), new Merge(), args);
    Merge.install(crawldb);

    if (fs.exists(tempdb)) {
        fs.delete(tempdb);
    }

}

From source file: cn.lhfei.hadoop.ch02.MaxTemperatureMapper.java

License: Apache License

@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
        throws IOException, InterruptedException {

    String line = value.toString();
    String year = line.substring(15, 19);
    int airTemperature;

    if (line.charAt(87) == '+') { // parseInt doesn't like leading plus
        airTemperature = Integer.parseInt(line.substring(88, 92));
    } else {
        airTemperature = Integer.parseInt(line.substring(87, 92));
    }
    String quality = line.substring(92, 93);
    if (airTemperature != MISSING && quality.matches("[01459]")) {
        context.write(new Text(year), new IntWritable(airTemperature));
    }
}

From source file: cn.lhfei.hadoop.ch05.v2.MaxTemperatureMapper.java

License: Apache License

@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

    String line = value.toString();
    String year = line.substring(15, 19);
    String temp = line.substring(87, 92);
    if (!missing(temp)) {
        int airTemperature = Integer.parseInt(temp);
        context.write(new Text(year), new IntWritable(airTemperature));
    }
}

From source file: cn.lhfei.hadoop.ch05.v3.MaxTemperatureMapper.java

License: Apache License

@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
        throws IOException, InterruptedException {
    if (parser.isValidTemperature()) {
        context.write(new Text(parser.getYear()), new IntWritable(parser.getAirTemperature()));
    }
}

From source file: cn.lhfei.hive.udf.SimpleUDFExample.java

License: Apache License

public Text evaluate(Text input) {
    if (input == null)
        return null;
    return new Text("Hello " + input.toString());
}

From source file: cn.lhfei.hive.udf.SimpleUDFExampleTest.java

License: Apache License

@Test
public void testUDF() {
    SimpleUDFExample example = new SimpleUDFExample();
    Assert.assertEquals("Hello world", example.evaluate(new Text("world")).toString());
}

From source file: cn.lhfei.spark.hive.udf.LowerUDF.java

License: Apache License

public Text evaluate(final Text s) {
    if (s == null) {
        return null;
    }
    return new Text(s.toString().toLowerCase());
}

From source file: cn.uc.hadoop.mapreduce.lib.input.FileNameLineRecordReader.java

License: Apache License

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    // Added by qiujw: use the file name as the record key
    key = new Text(file.getName());

    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }

            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }

        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}