List of usage examples for the org.apache.hadoop.mapred.TextInputFormat constructor TextInputFormat()
TextInputFormat
From source file:RunText.java
License:Apache License
/**
 * Command-line entry point.
 *
 * <p>Parses the options with JCommander, divides the input file into one byte range per worker
 * thread, starts a {@code RunText} worker on each range, and prints a records-per-second figure
 * every five seconds for as long as {@code runningThreads} is greater than zero.
 *
 * <p>The file length rarely divides evenly by the thread count, so the last worker is given the
 * leftover bytes in addition to its regular share; otherwise the tail of the file would never be
 * read.
 *
 * @param args command-line arguments handed to JCommander
 * @throws Exception if file-system access fails or the main thread is interrupted while waiting
 */
public static void main(String[] args) throws Exception {
    o = new Options();
    JCommander jc = null;
    try {
        jc = new JCommander(o, args);
        jc.setProgramName("./runText");
    } catch (ParameterException e) {
        System.out.println(e.getMessage());
        // Placeholder arguments used only to build a JCommander that can print the usage text.
        // NOTE(review): assumes Options declares the value flag as "-v" like its siblings — confirm.
        String[] valid = { "-p", "path", "-d", "delimiter", "-v", "value", "-i", "index" };
        new JCommander(o, valid).usage();
        System.exit(-1);
    }
    if (o.help) {
        jc.usage();
        System.exit(0);
    }

    path = new Path(o.path);
    delim = o.delimiter.getBytes()[0];
    toFind = o.value;
    index = o.index;
    numThreads = o.threads;

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    TextInputFormat format = new TextInputFormat();

    long fileLen = fs.getFileStatus(path).getLen();
    long len = fileLen / numThreads;
    List<Thread> threads = Lists.newArrayList();
    for (int i = 0; i < numThreads; i++) {
        long start = i * len;
        // Integer division drops up to (numThreads - 1) bytes; the final split absorbs them.
        long splitLen = (i == numThreads - 1) ? fileLen - start : len;
        FileSplit split = new FileSplit(path, start, splitLen, new String[] { "" });
        threads.add(new Thread(new RunText(split, format)));
    }

    runningThreads = new AtomicInteger(numThreads);
    for (Thread t : threads) {
        t.start();
    }

    // Report throughput as the delta of totalCount over the elapsed wall-clock interval.
    int prev = 0;
    int current;
    long t1 = System.nanoTime();
    long t2;
    while (runningThreads.get() > 0) {
        Thread.sleep(5000);
        current = totalCount.get();
        t2 = System.nanoTime();
        System.out.println(String.format("%f records/sec", (current - prev) * 1e9 / (t2 - t1)));
        t1 = t2;
        prev = current;
    }

    for (Thread t : threads) {
        t.join();
    }
    fs.close();
}
From source file:TestTextInputFormat.java
License:Open Source License
public static void main(String[] argv) throws IOException, SerDeException { try {//from ww w. j a va 2 s .c o m if (argv.length != 2) { System.out.println("TestTextInputFormat <input> <output>"); System.exit(-1); } JobConf conf = new JobConf(TestTextInputFormat.class); conf.setJobName("TestTextInputFormat"); conf.setNumMapTasks(1); conf.setNumReduceTasks(1); conf.setOutputKeyClass(LongWritable.class); conf.setOutputValueClass(Unit.Record.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(FormatStorageOutputFormat.class); conf.set("mapred.output.compress", "flase"); conf.set("mapred.input.dir", argv[0]); LazySimpleSerDe serDe = initSerDe(conf); LazySimpleStructObjectInspector oi = (LazySimpleStructObjectInspector) serDe.getObjectInspector(); List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs(); FileInputFormat.setInputPaths(conf, argv[0]); Path outputPath = new Path(argv[1]); FileOutputFormat.setOutputPath(conf, outputPath); InputFormat inputFormat = new TextInputFormat(); ((TextInputFormat) inputFormat).configure(conf); InputSplit[] inputSplits = inputFormat.getSplits(conf, 1); if (inputSplits.length == 0) { System.out.println("inputSplits is empty"); return; } else { System.out.println("get Splits:" + inputSplits.length); } int totalDelay = 0; RecordReader<WritableComparable, Writable> currRecReader = null; for (int i = 0; i < inputSplits.length; i++) { currRecReader = inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL); WritableComparable key; Writable value; key = currRecReader.createKey(); value = currRecReader.createValue(); long begin = System.currentTimeMillis(); int count = 0; while (currRecReader.next(key, value)) { Object row = serDe.deserialize((Text) value); oi.getStructFieldsDataAsList(row); count++; } long end = System.currentTimeMillis(); long delay = (end - begin) / 1000; totalDelay += delay; System.out.println(count + " record read over, delay " + delay + " s"); } System.out.println("total delay:" + 
totalDelay); return; } catch (Exception e) { e.printStackTrace(); System.out.println("get exception:" + e.getMessage()); } }
From source file:com.cloudera.knittingboar.io.InputRecordsSplit.java
License:Apache License
/**
 * Opens a record reader over one input split using a freshly created {@link TextInputFormat}.
 *
 * <p>Only the key object is created here; no value object is allocated by the constructor.
 *
 * @param jobConf job configuration handed to the record reader
 * @param split the split to read
 * @throws IOException if the record reader cannot be opened
 */
public InputRecordsSplit(JobConf jobConf, InputSplit split) throws IOException {
    this.jobConf = jobConf;
    this.split = split;

    this.input_format = new TextInputFormat();
    this.reader = this.input_format.getRecordReader(split, jobConf, voidReporter);
    this.key = this.reader.createKey();
}
From source file:com.cloudera.knittingboar.io.TestInputRecordsSplit.java
License:Apache License
/** * create an InputRecordSplit and then read some records * /*from www.j a va2 s.co m*/ * - make sure we maintain split discipline * @throws IOException * */ public void testReadSplitViaInputRecordsSplit() throws IOException { // InputRecordsSplit(JobConf jobConf, InputSplit split) // needs to get a jobConf from somewhere, under the hood // needs a split calculated from the aforementioned jobConf JobConf job = new JobConf(defaultConf); Path file = new Path(workDir, "testReadSplitViaInputRecordsSplit.txt"); int tmp_file_size = 2000; long block_size = localFs.getDefaultBlockSize(); System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB"); Writer writer = new OutputStreamWriter(localFs.create(file)); try { for (int i = 0; i < tmp_file_size; i++) { writer.write( "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99"); writer.write("\n"); } } finally { writer.close(); } System.out.println("file write complete, wrote " + tmp_file_size + " recs"); // A reporter that does nothing Reporter reporter = Reporter.NULL; System.out.println("> setting splits for: " + workDir); // localFs.delete(workDir, true); FileInputFormat.setInputPaths(job, file); // try splitting the file in a variety of sizes TextInputFormat format = new TextInputFormat(); format.configure(job); LongWritable key = new LongWritable(); Text value = new Text(); int numSplits = 1; InputSplit[] splits = format.getSplits(job, numSplits); LOG.info("requested " + numSplits + " splits, splitting: got = " + splits.length); System.out.println("---- debug splits --------- "); //InputSplit test_split = null; int total_read = 0; for (int x = 0; x < splits.length; x++) { System.out.println("> Split [" + x + "]: " + splits[x].getLength()); 
int count = 0; InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]); while (custom_reader.next(value)) { count++; // } System.out.println("read: " + count + " records for split " + x); total_read += count; } // for each split System.out.println("--------- total read across all splits: " + total_read); assertEquals(tmp_file_size, total_read); }
From source file:com.cloudera.knittingboar.io.TestInputRecordsSplit.java
License:Apache License
/**
 * Reads the first split of a generated file through an {@code InputRecordsSplit}, calls
 * {@code ResetToStartOfSplit()}, reads it again, and asserts that both passes return the same
 * number of records.
 *
 * @throws IOException if writing the fixture file or reading the split fails
 */
public void testReadSplitViaInputRecordsSplit_SplitReset() throws IOException {
    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testReadSplitViaInputRecordsSplit_SplitReset");
    int tmp_file_size = 2000;

    long blockSizeBytes = localFs.getDefaultBlockSize();
    System.out.println("default block size: " + (blockSizeBytes / 1024 / 1024) + "MB");

    // Every record in the fixture is this same line.
    final String recordLine =
            "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99";
    Writer out = new OutputStreamWriter(localFs.create(file));
    try {
        int written = 0;
        while (written < tmp_file_size) {
            out.write(recordLine);
            out.write("\n");
            written++;
        }
    } finally {
        out.close();
    }
    System.out.println("file write complete, wrote " + tmp_file_size + " recs");

    Reporter reporter = Reporter.NULL;
    FileInputFormat.setInputPaths(job, file);

    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text line = new Text();

    int numSplits = 1;
    InputSplit[] splits = format.getSplits(job, numSplits);
    LOG.info("requested " + numSplits + " splits, splitting: got = " + splits.length);
    System.out.println("---- testReadSplitViaInputRecordsSplit_SplitReset: debug splits --------- ");

    int total_read = 0;
    final InputSplit firstSplit = splits[0];
    System.out.println("> Split [0]: " + firstSplit.getLength());

    InputRecordsSplit splitReader = new InputRecordsSplit(job, firstSplit);

    // First pass over the split.
    int firstPass = 0;
    while (splitReader.next(line)) {
        firstPass++;
    }
    System.out.println("read: " + firstPass + " records for split " + 0);

    // Second pass after rewinding the reader.
    splitReader.ResetToStartOfSplit();
    int secondPass = 0;
    while (splitReader.next(line)) {
        secondPass++;
    }
    System.out.println("read: " + secondPass + " records for split after reset " + 0);

    assertEquals(firstPass, secondPass);
}
From source file:com.cloudera.knittingboar.io.TestSplitCalcs.java
License:Apache License
/** * /*from w w w . j a v a 2s. c om*/ * - use the TextInputFormat.getSplits() to test pulling split info * @throws IOException * */ public void testGetSplits() throws IOException { TextInputFormat input = new TextInputFormat(); JobConf job = new JobConf(defaultConf); Path file = new Path(workDir, "testGetSplits.txt"); int tmp_file_size = 200000; long block_size = localFs.getDefaultBlockSize(); System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB"); Writer writer = new OutputStreamWriter(localFs.create(file)); try { for (int i = 0; i < tmp_file_size; i++) { writer.write( "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99"); writer.write("\n"); } } finally { writer.close(); } System.out.println("file write complete"); // A reporter that does nothing Reporter reporter = Reporter.NULL; // localFs.delete(workDir, true); FileInputFormat.setInputPaths(job, file); // try splitting the file in a variety of sizes TextInputFormat format = new TextInputFormat(); format.configure(job); LongWritable key = new LongWritable(); Text value = new Text(); int numSplits = 1; InputSplit[] splits = format.getSplits(job, numSplits); LOG.info("requested " + numSplits + " splits, splitting: got = " + splits.length); assertEquals(2, splits.length); System.out.println("---- debug splits --------- "); for (int x = 0; x < splits.length; x++) { System.out.println("> Split [" + x + "]: " + splits[x].getLength() + ", " + splits[x].toString() + ", " + splits[x].getLocations()[0]); RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[x], job, reporter); try { int count = 0; while (reader.next(key, value)) { if (count == 0) { System.out.println("first: " + value.toString()); 
assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p")); } count++; } System.out.println("last: " + value.toString()); assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p")); } finally { reader.close(); } } // for each split }
From source file:com.cloudera.knittingboar.metrics.Test20NewsApplyModel.java
License:Apache License
public InputSplit[] generateDebugSplits(Path input_path, JobConf job) { long block_size = localFs.getDefaultBlockSize(); System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB"); // ---- set where we'll read the input files from ------------- FileInputFormat.setInputPaths(job, input_path); // try splitting the file in a variety of sizes TextInputFormat format = new TextInputFormat(); format.configure(job);//from w w w.j av a 2s .c o m int numSplits = 1; InputSplit[] splits = null; try { splits = format.getSplits(job, numSplits); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return splits; }
From source file:com.cloudera.knittingboar.metrics.Test20NewsNoSaveModel.java
License:Apache License
public InputSplit[] generateDebugSplits(Path input_path, JobConf job) { long block_size = localFs.getDefaultBlockSize(); System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB"); FileInputFormat.setInputPaths(job, input_path); // try splitting the file in a variety of sizes TextInputFormat format = new TextInputFormat(); format.configure(job);//w w w. j a va 2s .com int numSplits = 1; InputSplit[] splits = null; try { splits = format.getSplits(job, numSplits); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return splits; }
From source file:com.cloudera.knittingboar.metrics.TestSaveLoadModel.java
License:Apache License
public InputSplit[] generateDebugSplits(Path input_path, JobConf job) { long block_size = localFs.getDefaultBlockSize(); System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB"); FileInputFormat.setInputPaths(job, input_path); TextInputFormat format = new TextInputFormat(); format.configure(job);// w w w.ja v a 2 s.c o m int numSplits = 1; InputSplit[] splits = null; try { splits = format.getSplits(job, numSplits); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return splits; }
From source file:com.cloudera.knittingboar.records.TestTwentyNewsgroupsCustomRecordParseOLRRun.java
License:Apache License
@Test public void testRecordFactoryOnDatasetShard() throws Exception { // TODO a test with assertions is not a test // p.270 ----- metrics to track lucene's parsing mechanics, progress, // performance of OLR ------------ double averageLL = 0.0; double averageCorrect = 0.0; int k = 0;// ww w. jav a2s . c o m double step = 0.0; int[] bumps = new int[] { 1, 2, 5 }; TwentyNewsgroupsRecordFactory rec_factory = new TwentyNewsgroupsRecordFactory("\t"); // rec_factory.setClassSplitString("\t"); JobConf job = new JobConf(defaultConf); long block_size = localFs.getDefaultBlockSize(workDir); LOG.info("default block size: " + (block_size / 1024 / 1024) + "MB"); // matches the OLR setup on p.269 --------------- // stepOffset, decay, and alpha --- describe how the learning rate decreases // lambda: amount of regularization // learningRate: amount of initial learning rate @SuppressWarnings("resource") OnlineLogisticRegression learningAlgorithm = new OnlineLogisticRegression(20, FEATURES, new L1()).alpha(1) .stepOffset(1000).decayExponent(0.9).lambda(3.0e-5).learningRate(20); FileInputFormat.setInputPaths(job, workDir); // try splitting the file in a variety of sizes TextInputFormat format = new TextInputFormat(); format.configure(job); Text value = new Text(); int numSplits = 1; InputSplit[] splits = format.getSplits(job, numSplits); LOG.info("requested " + numSplits + " splits, splitting: got = " + splits.length); LOG.info("---- debug splits --------- "); rec_factory.Debug(); int total_read = 0; for (int x = 0; x < splits.length; x++) { LOG.info("> Split [" + x + "]: " + splits[x].getLength()); int count = 0; InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]); while (custom_reader.next(value)) { Vector v = new RandomAccessSparseVector(TwentyNewsgroupsRecordFactory.FEATURES); int actual = rec_factory.processLine(value.toString(), v); String ng = rec_factory.GetNewsgroupNameByID(actual); // calc stats --------- double mu = Math.min(k + 1, 200); double ll = 
learningAlgorithm.logLikelihood(actual, v); averageLL = averageLL + (ll - averageLL) / mu; Vector p = new DenseVector(20); learningAlgorithm.classifyFull(p, v); int estimated = p.maxValueIndex(); int correct = (estimated == actual ? 1 : 0); averageCorrect = averageCorrect + (correct - averageCorrect) / mu; learningAlgorithm.train(actual, v); k++; int bump = bumps[(int) Math.floor(step) % bumps.length]; int scale = (int) Math.pow(10, Math.floor(step / bumps.length)); if (k % (bump * scale) == 0) { step += 0.25; LOG.info(String.format("%10d %10.3f %10.3f %10.2f %s %s", k, ll, averageLL, averageCorrect * 100, ng, rec_factory.GetNewsgroupNameByID(estimated))); } learningAlgorithm.close(); count++; } LOG.info("read: " + count + " records for split " + x); total_read += count; } // for each split LOG.info("total read across all splits: " + total_read); rec_factory.Debug(); }