List of usage examples for org.apache.hadoop.io.Text
public Text(byte[] utf8)
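None of the snippets below invoke the byte[] overload directly (they all build Text from a String); as a minimal sketch of the constructor itself, assuming UTF-8 encoded input bytes:

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.Text;

public class TextFromBytesDemo {
    public static void main(String[] args) {
        // The byte array must already be UTF-8 encoded; Text copies it in directly.
        byte[] utf8 = "hello".getBytes(StandardCharsets.UTF_8);
        Text text = new Text(utf8);
        System.out.println(text);             // hello
        System.out.println(text.getLength()); // 5 (length in bytes, not characters)
    }
}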
From source file:cn.edu.hfut.dmic.webcollector.fetcher.FetcherOutputFormat.java
@Override
public RecordWriter<Text, Writable> getRecordWriter(TaskAttemptContext tac)
        throws IOException, InterruptedException {
    Configuration conf = tac.getConfiguration();
    FileSystem fs = FileSystem.get(conf);
    String outputPath = conf.get("mapred.output.dir");
    Path fetchPath = new Path(outputPath, "fetch/info");
    Path contentPath = new Path(outputPath, "content/info");
    Path parseDataPath = new Path(outputPath, "parse/info");
    Path redirectPath = new Path(outputPath, "redirect/info");
    final SequenceFile.Writer fetchOut =
            new SequenceFile.Writer(fs, conf, fetchPath, Text.class, CrawlDatum.class);
    final SequenceFile.Writer contentOut =
            new SequenceFile.Writer(fs, conf, contentPath, Text.class, Content.class);
    final SequenceFile.Writer parseDataOut =
            new SequenceFile.Writer(fs, conf, parseDataPath, Text.class, CrawlDatum.class);
    final SequenceFile.Writer redirectOut =
            new SequenceFile.Writer(fs, conf, redirectPath, CrawlDatum.class, Text.class);
    return new RecordWriter<Text, Writable>() {
        @Override
        public void write(Text k, Writable v) throws IOException, InterruptedException {
            if (v instanceof CrawlDatum) {
                fetchOut.append(k, v);
            } else if (v instanceof Content) {
                contentOut.append(k, v);
            } else if (v instanceof ParseData) {
                ParseData parseData = (ParseData) v;
                CrawlDatums next = parseData.next;
                for (CrawlDatum datum : next) {
                    parseDataOut.append(new Text(datum.getKey()), datum);
                }
            } else if (v instanceof Redirect) {
                Redirect redirect = (Redirect) v;
                redirectOut.append(redirect.datum, new Text(redirect.realUrl));
            }
        }

        @Override
        public void close(TaskAttemptContext tac) throws IOException, InterruptedException {
            fetchOut.close();
            contentOut.close();
            parseDataOut.close();
            redirectOut.close();
        }
    };
}
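The SequenceFile.Writer constructor used here (and in the next two examples) is deprecated in Hadoop 2+ in favor of the SequenceFile.createWriter factory. A minimal sketch of the equivalent call for the fetchOut writer above, reusing conf and fetchPath from the snippet:

SequenceFile.Writer fetchOut = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(fetchPath),
        SequenceFile.Writer.keyClass(Text.class),
        SequenceFile.Writer.valueClass(CrawlDatum.class));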
From source file:cn.edu.hfut.dmic.webcollectorcluster.fetcher.FetcherOutputFormat.java
@Override
public org.apache.hadoop.mapred.RecordWriter<Text, WebWritable> getRecordWriter(FileSystem fs, JobConf jc,
        String string, Progressable p) throws IOException {
    Configuration conf = jc;
    String outputPath = conf.get("mapred.output.dir");
    Path fetchPath = new Path(outputPath, "fetch/info");
    Path contentPath = new Path(outputPath, "content/info");
    Path parseDataPath = new Path(outputPath, "parse_data/info");
    Path parseTempPath = new Path(outputPath, "parse_temp/info");
    final SequenceFile.Writer fetchOut =
            new SequenceFile.Writer(fs, conf, fetchPath, Text.class, CrawlDatum.class);
    final SequenceFile.Writer contentOut =
            new SequenceFile.Writer(fs, conf, contentPath, Text.class, Content.class);
    final SequenceFile.Writer parseDataOut =
            new SequenceFile.Writer(fs, conf, parseDataPath, Text.class, ParseData.class);
    final SequenceFile.Writer parseTempOut =
            new SequenceFile.Writer(fs, conf, parseTempPath, Text.class, CrawlDatum.class);
    return new RecordWriter<Text, WebWritable>() {
        @Override
        public void write(Text key, WebWritable value) throws IOException {
            Writable w = value.get();
            if (w instanceof CrawlDatum) {
                fetchOut.append(key, w);
            } else if (w instanceof Content) {
                contentOut.append(key, w);
            } else if (w instanceof ParseData) {
                parseDataOut.append(key, w);
                ParseData parseData = (ParseData) w;
                if (parseData.getLinks() != null) {
                    for (Link link : parseData.getLinks()) {
                        CrawlDatum datum = new CrawlDatum();
                        datum.setUrl(link.getUrl());
                        datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
                        datum.setFetchTime(CrawlDatum.FETCHTIME_UNDEFINED);
                        parseTempOut.append(new Text(datum.getUrl()), datum);
                    }
                }
            }
        }

        @Override
        public void close(Reporter rprtr) throws IOException {
            fetchOut.close();
            contentOut.close();
            parseDataOut.close();
            parseTempOut.close();
        }
    };
}
From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.Injector.java
public void inject(Path crawlDir, ArrayList<String> urls)
        throws IOException, InterruptedException, ClassNotFoundException, Exception {
    Path crawldb = new Path(crawlDir, "crawldb");
    Configuration config = CrawlerConfiguration.create();
    System.out.println(config.get("mapred.jar"));
    FileSystem fs = crawldb.getFileSystem(config);
    Path tempdb = new Path(crawldb, "temp");
    if (fs.exists(tempdb)) {
        fs.delete(tempdb);
    }
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, config, new Path(tempdb, "info.avro"),
            Text.class, CrawlDatum.class);
    for (String url : urls) {
        CrawlDatum crawldatum = new CrawlDatum();
        crawldatum.setUrl(url);
        crawldatum.setStatus(CrawlDatum.STATUS_DB_INJECTED);
        writer.append(new Text(url), crawldatum);
        System.out.println("inject:" + url);
    }
    writer.close();
    String[] args = new String[] { crawldb.toString(), tempdb.toString() };
    ToolRunner.run(CrawlerConfiguration.create(), new Merge(), args);
    Merge.install(crawldb);
    if (fs.exists(tempdb)) {
        fs.delete(tempdb);
    }
}
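A hypothetical caller for the method above; the seed URL and the crawl directory name are made up for illustration:

Injector injector = new Injector();
ArrayList<String> seeds = new ArrayList<String>();
seeds.add("http://example.com/");
injector.inject(new Path("crawl"), seeds);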
From source file:cn.lhfei.hadoop.ch02.MaxTemperatureMapper.java
License:Apache License
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
        throws IOException, InterruptedException {
    String line = value.toString();
    String year = line.substring(15, 19);
    int airTemperature;
    if (line.charAt(87) == '+') { // parseInt doesn't like leading plus signs
        airTemperature = Integer.parseInt(line.substring(88, 92));
    } else {
        airTemperature = Integer.parseInt(line.substring(87, 92));
    }
    String quality = line.substring(92, 93);
    if (airTemperature != MISSING && quality.matches("[01459]")) {
        context.write(new Text(year), new IntWritable(airTemperature));
    }
}
From source file:cn.lhfei.hadoop.ch05.v2.MaxTemperatureMapper.java
License:Apache License
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String line = value.toString();
    String year = line.substring(15, 19);
    String temp = line.substring(87, 92);
    if (!missing(temp)) {
        int airTemperature = Integer.parseInt(temp);
        context.write(new Text(year), new IntWritable(airTemperature));
    }
}
From source file:cn.lhfei.hadoop.ch05.v3.MaxTemperatureMapper.java
License:Apache License
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
        throws IOException, InterruptedException {
    parser.parse(value); // populate the parser from the current record before querying it
    if (parser.isValidTemperature()) {
        context.write(new Text(parser.getYear()), new IntWritable(parser.getAirTemperature()));
    }
}
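All three mappers above emit (year, temperature) pairs and are normally paired with a reducer that keeps the maximum temperature per year. A minimal sketch of such a reducer, which is not part of the sources listed here:

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MaxTemperatureReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Keep the largest temperature observed for this year.
        int maxValue = Integer.MIN_VALUE;
        for (IntWritable value : values) {
            maxValue = Math.max(maxValue, value.get());
        }
        context.write(key, new IntWritable(maxValue));
    }
}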
From source file:cn.lhfei.hive.udf.SimpleUDFExample.java
License:Apache License
public Text evaluate(Text input) {
    if (input == null) {
        return null;
    }
    return new Text("Hello " + input.toString());
}
From source file:cn.lhfei.hive.udf.SimpleUDFExampleTest.java
License:Apache License
@Test
public void testUDF() {
    SimpleUDFExample example = new SimpleUDFExample();
    Assert.assertEquals("Hello world", example.evaluate(new Text("world")).toString());
}
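To call a UDF like this from Hive queries rather than a unit test, the jar is first registered and the class bound to a function name, along the lines of ADD JAR /path/to/udf.jar; followed by CREATE TEMPORARY FUNCTION hello AS 'cn.lhfei.hive.udf.SimpleUDFExample'; — the function name hello and the jar path here are illustrative.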
From source file:cn.lhfei.spark.hive.udf.LowerUDF.java
License:Apache License
public Text evaluate(final Text s) {
    if (s == null) {
        return null;
    }
    return new Text(s.toString().toLowerCase());
}
From source file:cn.uc.hadoop.mapreduce.lib.input.FileNameLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    // Added by qiujw: use the file name as the record key
    key = new Text(file.getName());
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job,
                        this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
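A record reader like this is normally handed out by a custom input format. A minimal sketch of one follows; the class name FileNameTextInputFormat is invented for illustration, and the reader's key/value types are assumed to be Text/Text (file name and line):

public class FileNameTextInputFormat extends FileInputFormat<Text, Text> {

    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
        // Each reader keys every line of its split with the split's file name.
        return new FileNameLineRecordReader();
    }
}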