List of usage examples for org.apache.hadoop.fs.FileUtil.stat2Paths
public static Path[] stat2Paths(FileStatus[] stats)
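stat2Paths extracts the Path from each FileStatus, so the common pattern in the examples below is a FileSystem.listStatus (or globStatus) call followed by FileUtil.stat2Paths. A minimal, self-contained sketch of that pattern; the class name and directory path are placeholders, not taken from the examples below:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

public class Stat2PathsExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    Path dir = new Path("/tmp/example");          // hypothetical directory
    FileStatus[] statuses = fs.listStatus(dir);   // one FileStatus per child entry
    Path[] paths = FileUtil.stat2Paths(statuses); // keep only the Path of each entry

    for (Path p : paths) {
      System.out.println(p);
    }
  }
}

Each example below applies this same conversion before asserting on, filtering, or iterating over the resulting paths.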
From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java
License:Apache License
public void doTestTextBatchAppend(boolean useRawLocalFileSystem) throws Exception {
  LOG.debug("Starting...");
  final long rollCount = 10;
  final long batchSize = 2;
  final String fileName = "FlumeData";
  String newPath = testPath + "/singleTextBucket";
  int totalEvents = 0;
  int i = 1, j = 1;

  // clear the test directory
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Path dirPath = new Path(newPath);
  fs.delete(dirPath, true);
  fs.mkdirs(dirPath);

  Context context = new Context();
  // context.put("hdfs.path", testPath + "/%Y-%m-%d/%H");
  context.put("hdfs.path", newPath);
  context.put("hdfs.filePrefix", fileName);
  context.put("hdfs.rollCount", String.valueOf(rollCount));
  context.put("hdfs.rollInterval", "0");
  context.put("hdfs.rollSize", "0");
  context.put("hdfs.batchSize", String.valueOf(batchSize));
  context.put("hdfs.writeFormat", "Text");
  context.put("hdfs.useRawLocalFileSystem", Boolean.toString(useRawLocalFileSystem));
  context.put("hdfs.fileType", "DataStream");
  Configurables.configure(sink, context);

  Channel channel = new MemoryChannel();
  Configurables.configure(channel, context);
  sink.setChannel(channel);
  sink.start();

  Calendar eventDate = Calendar.getInstance();
  List<String> bodies = Lists.newArrayList();

  // push the event batches into channel to roll twice
  for (i = 1; i <= (rollCount * 10) / batchSize; i++) {
    Transaction txn = channel.getTransaction();
    txn.begin();
    for (j = 1; j <= batchSize; j++) {
      Event event = new SimpleEvent();
      eventDate.clear();
      eventDate.set(2011, i, i, i, 0); // yy mm dd
      String body = "Test." + i + "." + j;
      event.setBody(body.getBytes());
      bodies.add(body);
      channel.put(event);
      totalEvents++;
    }
    txn.commit();
    txn.close();

    // execute sink to process the events
    sink.process();
  }
  sink.stop();

  // loop through all the files generated and check their contents
  FileStatus[] dirStat = fs.listStatus(dirPath);
  Path fList[] = FileUtil.stat2Paths(dirStat);

  // check that the roll happened correctly for the given data
  long expectedFiles = totalEvents / rollCount;
  if (totalEvents % rollCount > 0)
    expectedFiles++;
  Assert.assertEquals("num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);

  // check the contents of all the files
  verifyOutputTextFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);
}
From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java
License:Apache License
@Test
public void testTextAppend()
    throws InterruptedException, LifecycleException, EventDeliveryException, IOException {
  LOG.debug("Starting...");
  final long rollCount = 3;
  final long batchSize = 2;
  final String fileName = "FlumeData";
  String newPath = testPath + "/singleTextBucket";
  int totalEvents = 0;
  int i = 1, j = 1;

  // clear the test directory
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Path dirPath = new Path(newPath);
  fs.delete(dirPath, true);
  fs.mkdirs(dirPath);

  Context context = new Context();
  // context.put("hdfs.path", testPath + "/%Y-%m-%d/%H");
  context.put("hdfs.path", newPath);
  context.put("hdfs.filePrefix", fileName);
  context.put("hdfs.rollCount", String.valueOf(rollCount));
  context.put("hdfs.batchSize", String.valueOf(batchSize));
  context.put("hdfs.writeFormat", "Text");
  context.put("hdfs.fileType", "DataStream");
  Configurables.configure(sink, context);

  Channel channel = new MemoryChannel();
  Configurables.configure(channel, context);
  sink.setChannel(channel);
  sink.start();

  Calendar eventDate = Calendar.getInstance();
  List<String> bodies = Lists.newArrayList();

  // push the event batches into channel
  for (i = 1; i < 4; i++) {
    Transaction txn = channel.getTransaction();
    txn.begin();
    for (j = 1; j <= batchSize; j++) {
      Event event = new SimpleEvent();
      eventDate.clear();
      eventDate.set(2011, i, i, i, 0); // yy mm dd
      event.getHeaders().put("timestamp", String.valueOf(eventDate.getTimeInMillis()));
      event.getHeaders().put("hostname", "Host" + i);
      String body = "Test." + i + "." + j;
      event.setBody(body.getBytes());
      bodies.add(body);
      channel.put(event);
      totalEvents++;
    }
    txn.commit();
    txn.close();

    // execute sink to process the events
    sink.process();
  }
  sink.stop();

  // loop through all the files generated and check their contents
  FileStatus[] dirStat = fs.listStatus(dirPath);
  Path fList[] = FileUtil.stat2Paths(dirStat);

  // check that the roll happened correctly for the given data
  long expectedFiles = totalEvents / rollCount;
  if (totalEvents % rollCount > 0)
    expectedFiles++;
  Assert.assertEquals("num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);
  verifyOutputTextFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);
}
From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java
License:Apache License
@Test
public void testAvroAppend()
    throws InterruptedException, LifecycleException, EventDeliveryException, IOException {
  LOG.debug("Starting...");
  final long rollCount = 3;
  final long batchSize = 2;
  final String fileName = "FlumeData";
  String newPath = testPath + "/singleTextBucket";
  int totalEvents = 0;
  int i = 1, j = 1;

  // clear the test directory
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Path dirPath = new Path(newPath);
  fs.delete(dirPath, true);
  fs.mkdirs(dirPath);

  Context context = new Context();
  // context.put("hdfs.path", testPath + "/%Y-%m-%d/%H");
  context.put("hdfs.path", newPath);
  context.put("hdfs.filePrefix", fileName);
  context.put("hdfs.rollCount", String.valueOf(rollCount));
  context.put("hdfs.batchSize", String.valueOf(batchSize));
  context.put("hdfs.writeFormat", "Text");
  context.put("hdfs.fileType", "DataStream");
  context.put("serializer", "AVRO_EVENT");
  Configurables.configure(sink, context);

  Channel channel = new MemoryChannel();
  Configurables.configure(channel, context);
  sink.setChannel(channel);
  sink.start();

  Calendar eventDate = Calendar.getInstance();
  List<String> bodies = Lists.newArrayList();

  // push the event batches into channel
  for (i = 1; i < 4; i++) {
    Transaction txn = channel.getTransaction();
    txn.begin();
    for (j = 1; j <= batchSize; j++) {
      Event event = new SimpleEvent();
      eventDate.clear();
      eventDate.set(2011, i, i, i, 0); // yy mm dd
      event.getHeaders().put("timestamp", String.valueOf(eventDate.getTimeInMillis()));
      event.getHeaders().put("hostname", "Host" + i);
      String body = "Test." + i + "." + j;
      event.setBody(body.getBytes());
      bodies.add(body);
      channel.put(event);
      totalEvents++;
    }
    txn.commit();
    txn.close();

    // execute sink to process the events
    sink.process();
  }
  sink.stop();

  // loop through all the files generated and check their contents
  FileStatus[] dirStat = fs.listStatus(dirPath);
  Path fList[] = FileUtil.stat2Paths(dirStat);

  // check that the roll happened correctly for the given data
  long expectedFiles = totalEvents / rollCount;
  if (totalEvents % rollCount > 0)
    expectedFiles++;
  Assert.assertEquals("num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);
  verifyOutputAvroFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);
}
From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java
License:Apache License
@Test
public void testSimpleAppend()
    throws InterruptedException, LifecycleException, EventDeliveryException, IOException {
  LOG.debug("Starting...");
  final String fileName = "FlumeData";
  final long rollCount = 5;
  final long batchSize = 2;
  final int numBatches = 4;
  String newPath = testPath + "/singleBucket";
  int totalEvents = 0;
  int i = 1, j = 1;

  // clear the test directory
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Path dirPath = new Path(newPath);
  fs.delete(dirPath, true);
  fs.mkdirs(dirPath);

  Context context = new Context();
  context.put("hdfs.path", newPath);
  context.put("hdfs.filePrefix", fileName);
  context.put("hdfs.rollCount", String.valueOf(rollCount));
  context.put("hdfs.batchSize", String.valueOf(batchSize));
  Configurables.configure(sink, context);

  Channel channel = new MemoryChannel();
  Configurables.configure(channel, context);
  sink.setChannel(channel);
  sink.start();

  Calendar eventDate = Calendar.getInstance();
  List<String> bodies = Lists.newArrayList();

  // push the event batches into channel
  for (i = 1; i < numBatches; i++) {
    Transaction txn = channel.getTransaction();
    txn.begin();
    for (j = 1; j <= batchSize; j++) {
      Event event = new SimpleEvent();
      eventDate.clear();
      eventDate.set(2011, i, i, i, 0); // yy mm dd
      event.getHeaders().put("timestamp", String.valueOf(eventDate.getTimeInMillis()));
      event.getHeaders().put("hostname", "Host" + i);
      String body = "Test." + i + "." + j;
      event.setBody(body.getBytes());
      bodies.add(body);
      channel.put(event);
      totalEvents++;
    }
    txn.commit();
    txn.close();

    // execute sink to process the events
    sink.process();
  }
  sink.stop();

  // loop through all the files generated and check their contents
  FileStatus[] dirStat = fs.listStatus(dirPath);
  Path fList[] = FileUtil.stat2Paths(dirStat);

  // check that the roll happened correctly for the given data
  long expectedFiles = totalEvents / rollCount;
  if (totalEvents % rollCount > 0)
    expectedFiles++;
  Assert.assertEquals("num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);
  verifyOutputSequenceFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);
}
From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java
License:Apache License
@Test
public void testSimpleAppendLocalTime()
    throws InterruptedException, LifecycleException, EventDeliveryException, IOException {
  final long currentTime = System.currentTimeMillis();
  Clock clk = new Clock() {
    @Override
    public long currentTimeMillis() {
      return currentTime;
    }
  };

  LOG.debug("Starting...");
  final String fileName = "FlumeData";
  final long rollCount = 5;
  final long batchSize = 2;
  final int numBatches = 4;
  String newPath = testPath + "/singleBucket/%s";
  String expectedPath = testPath + "/singleBucket/" + String.valueOf(currentTime / 1000);
  int totalEvents = 0;
  int i = 1, j = 1;

  // clear the test directory
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Path dirPath = new Path(expectedPath);
  fs.delete(dirPath, true);
  fs.mkdirs(dirPath);

  Context context = new Context();
  context.put("hdfs.path", newPath);
  context.put("hdfs.filePrefix", fileName);
  context.put("hdfs.rollCount", String.valueOf(rollCount));
  context.put("hdfs.batchSize", String.valueOf(batchSize));
  context.put("hdfs.useLocalTimeStamp", String.valueOf(true));
  Configurables.configure(sink, context);

  Channel channel = new MemoryChannel();
  Configurables.configure(channel, context);
  sink.setChannel(channel);
  sink.setBucketClock(clk);
  sink.start();

  Calendar eventDate = Calendar.getInstance();
  List<String> bodies = Lists.newArrayList();

  // push the event batches into channel
  for (i = 1; i < numBatches; i++) {
    Transaction txn = channel.getTransaction();
    txn.begin();
    for (j = 1; j <= batchSize; j++) {
      Event event = new SimpleEvent();
      eventDate.clear();
      eventDate.set(2011, i, i, i, 0); // yy mm dd
      event.getHeaders().put("timestamp", String.valueOf(eventDate.getTimeInMillis()));
      event.getHeaders().put("hostname", "Host" + i);
      String body = "Test." + i + "." + j;
      event.setBody(body.getBytes());
      bodies.add(body);
      channel.put(event);
      totalEvents++;
    }
    txn.commit();
    txn.close();

    // execute sink to process the events
    sink.process();
  }
  sink.stop();

  // loop through all the files generated and check their contents
  FileStatus[] dirStat = fs.listStatus(dirPath);
  Path fList[] = FileUtil.stat2Paths(dirStat);

  // check that the roll happened correctly for the given data
  long expectedFiles = totalEvents / rollCount;
  if (totalEvents % rollCount > 0)
    expectedFiles++;
  Assert.assertEquals("num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);
  verifyOutputSequenceFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);

  // The clock in bucketpath is static, so restore the real clock
  sink.setBucketClock(new SystemClock());
}
From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java
License:Apache License
private void slowAppendTestHelper(long appendTimeout)
    throws InterruptedException, IOException, LifecycleException, EventDeliveryException {
  final String fileName = "FlumeData";
  final long rollCount = 5;
  final long batchSize = 2;
  final int numBatches = 2;
  String newPath = testPath + "/singleBucket";
  int totalEvents = 0;
  int i = 1, j = 1;

  // clear the test directory
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Path dirPath = new Path(newPath);
  fs.delete(dirPath, true);
  fs.mkdirs(dirPath);

  // create HDFS sink with slow writer
  HDFSTestWriterFactory badWriterFactory = new HDFSTestWriterFactory();
  sink = new HDFSEventSink(badWriterFactory);

  Context context = new Context();
  context.put("hdfs.path", newPath);
  context.put("hdfs.filePrefix", fileName);
  context.put("hdfs.rollCount", String.valueOf(rollCount));
  context.put("hdfs.batchSize", String.valueOf(batchSize));
  context.put("hdfs.fileType", HDFSTestWriterFactory.TestSequenceFileType);
  context.put("hdfs.appendTimeout", String.valueOf(appendTimeout));
  Configurables.configure(sink, context);

  Channel channel = new MemoryChannel();
  Configurables.configure(channel, context);
  sink.setChannel(channel);
  sink.start();

  Calendar eventDate = Calendar.getInstance();
  List<String> bodies = Lists.newArrayList();

  // push the event batches into channel
  for (i = 0; i < numBatches; i++) {
    Transaction txn = channel.getTransaction();
    txn.begin();
    for (j = 1; j <= batchSize; j++) {
      Event event = new SimpleEvent();
      eventDate.clear();
      eventDate.set(2011, i, i, i, 0); // yy mm dd
      event.getHeaders().put("timestamp", String.valueOf(eventDate.getTimeInMillis()));
      event.getHeaders().put("hostname", "Host" + i);
      event.getHeaders().put("slow", "1500");
      String body = "Test." + i + "." + j;
      event.setBody(body.getBytes());
      bodies.add(body);
      channel.put(event);
      totalEvents++;
    }
    txn.commit();
    txn.close();

    // execute sink to process the events
    sink.process();
  }
  sink.stop();

  // loop through all the files generated and check their contents
  FileStatus[] dirStat = fs.listStatus(dirPath);
  Path fList[] = FileUtil.stat2Paths(dirStat);

  // check that the roll happened correctly for the given data
  // Note that we'll end up with two files with only a head
  long expectedFiles = totalEvents / rollCount;
  if (totalEvents % rollCount > 0)
    expectedFiles++;
  Assert.assertEquals("num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);
  verifyOutputSequenceFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);
}
From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java
License:Apache License
@Test
public void testCloseOnIdle() throws IOException, EventDeliveryException, InterruptedException {
  String hdfsPath = testPath + "/idleClose";
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Path dirPath = new Path(hdfsPath);
  fs.delete(dirPath, true);
  fs.mkdirs(dirPath);

  Context context = new Context();
  context.put("hdfs.path", hdfsPath);
  /*
   * All three rolling methods are disabled so the only
   * way a file can roll is through the idle timeout.
   */
  context.put("hdfs.rollCount", "0");
  context.put("hdfs.rollSize", "0");
  context.put("hdfs.rollInterval", "0");
  context.put("hdfs.batchSize", "2");
  context.put("hdfs.idleTimeout", "1");
  Configurables.configure(sink, context);

  Channel channel = new MemoryChannel();
  Configurables.configure(channel, context);
  sink.setChannel(channel);
  sink.start();

  Transaction txn = channel.getTransaction();
  txn.begin();
  for (int i = 0; i < 10; i++) {
    Event event = new SimpleEvent();
    event.setBody(("test event " + i).getBytes());
    channel.put(event);
  }
  txn.commit();
  txn.close();

  sink.process();
  sink.process();
  Thread.sleep(1001); // previous file should have timed out now

  // this can throw BucketClosedException (from the bucketWriter having
  // closed), this is not an issue as the sink will retry and get a fresh
  // bucketWriter so long as the onClose handler properly removes
  // bucket writers that were closed.
  sink.process();
  sink.process();
  Thread.sleep(500); // shouldn't be enough for a timeout to occur
  sink.process();
  sink.process();
  sink.stop();

  FileStatus[] dirStat = fs.listStatus(dirPath);
  Path[] fList = FileUtil.stat2Paths(dirStat);
  Assert.assertEquals("Incorrect content of the directory " + StringUtils.join(fList, ","), 2, fList.length);
  Assert.assertTrue(!fList[0].getName().endsWith(".tmp") && !fList[1].getName().endsWith(".tmp"));
  fs.close();
}
From source file:org.apache.giraph.aggregators.VarianceAggregatorDynamic.java
License:Apache License
@Override
public void aggregate(AggregateMessageCustome value) {
  //int megaStepInd = value.getMegaSlotInd();
  //if(value.getTweetList().get(0).getId() == -1)
  //  return;
  //if(neighborList == null)
  //  neighborList = new ArrayList<int[]> ();
  //if(simList == null)
  //  variance = new ArrayList<float> ();
  //if(tweets == null)
  //  tweets = new ArrayList<Tweet>();
  LinkedList<Tweet> tweetList = value.getTweetList();
  LinkedList<Tweet> newList = new LinkedList<Tweet>();

  if (tweetList.size() == 1) { // worker aggregate
    AggregateMessageCustome prev = getAggregatedValue();
    LinkedList<Tweet> newList1 = prev.getTweetList();
    Tweet currTweet = tweetList.get(0);
    boolean checkNext = false;
    //LinkedList<Tweet> missedTweets = new LinkedList<Tweet>();
    for (Tweet tweet : newList1) {
      if (newList.size() == factor * topicNumber) {
        break;
      }
      if (!checkNext) {
        if (tweet.getVariance() >= currTweet.getVariance()) {
          if (tweet.getConflictList().contains(currTweet.getId())) {
            newList = newList1;
            break;
          }
          newList.add(tweet);
        } else {
          // should add the currTweet
          newList.add(currTweet);
          checkNext = true;
          if (newList.size() == factor * topicNumber) {
            break;
          }
          if (!currTweet.getConflictList().contains(tweet.getId())) {
            newList.add(tweet);
          }
          //else {
          //  missedTweets.add(tweet);
          //}
        }
      } else {
        if (!currTweet.getConflictList().contains(tweet.getId())) {
          newList.add(tweet);
        }
        //else {
        //  missedTweets.add(tweet);
        //}
      }
    }
    if ((newList.size() < factor * topicNumber) && !checkNext) {
      newList.add(currTweet);
    }
    //Iterator<Tweet> missedIter = missedTweets.iterator();
    //while (newList.size() < factor * topicNumber) {
    //  if(!missedIter.hasNext())
    //    break;
    //  newList.add(missedIter.next());
    //}
  }

  if (tweetList.size() > 1) { // master aggregate
    AggregateMessageCustome prev = getAggregatedValue();
    newList = prev.getTweetList();
    for (Tweet tweet : tweetList) {
      newList.add(tweet);
    }
    stage++;
    System.out.println("stage value = " + stage);
  }
  getAggregatedValue().setTweetList(newList);
  //getAggregatedValue().setMegaSlotInd(megaStepInd);

  if (tweetList.size() > 1 && (stage % workerNum == 0) && (stage != 0)) {
    try {
      //System.out.println("About to write in the aggregator");
      //Path pt = new Path("/user/exp/ahmed/20k_10k/output/timeslot-0");
      //int numMegaStep = (stage/workerNum) - 1;
      FileSystem fs = FileSystem.get(new Configuration());
      FileStatus[] fileStatus = fs.listStatus(new Path("/user/exp/ahmed/50k_sliding_20k_output"));
      Path[] paths = FileUtil.stat2Paths(fileStatus);
      int numMegaStep = paths.length;
      //System.out.println("Current numMegaStep = "+numMegaStep);
      String outFileName = "/user/exp/ahmed/50k_sliding_20k_output/timeslot-" + numMegaStep + ".txt";
      Path pt = new Path(outFileName);
      System.out.println("Writing output to " + outFileName + ". pt = " + pt);
      BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
      Collections.sort(newList);
      /*
      String varFileName = "/user/exp/ahmed/20k_sliding_10k_output/var-" + numMegaStep + ".txt";
      Path varpt = new Path(varFileName);
      System.out.println("Writing output to " + varFileName + ". pt = " + varpt);
      BufferedWriter varbr = new BufferedWriter(new OutputStreamWriter(fs.create(varpt, true)));
      for (Tweet tweet : newList) {
        varbr.write(tweet.getId() + "\t" + tweet.getVariance());
        varbr.newLine();
      }
      varbr.close();
      */
      HashSet<Integer> selectedTweets = new HashSet<Integer>();
      int lastTopic = -1;
      int topicCount = 0;
      Iterator<Tweet> tweetItr = newList.iterator();
      Tweet nextTweet = tweetItr.next();
      Loop: while (topicCount < topicNumber) {
        while ((nextTweet.getConflictList().contains(lastTopic)) && (lastTopic != -1)) {
          if (!tweetItr.hasNext())
            break Loop;
          nextTweet = tweetItr.next();
        }
        lastTopic = nextTweet.getId();
        selectedTweets.add(lastTopic);
        if (!tweetItr.hasNext())
          break;
        nextTweet = tweetItr.next();
        topicCount++;
      }

      // get tweet text
      String fileName = "/user/exp/ahmed/50k_sliding_20k_dynamic_tweets/tweets-" + numMegaStep;
      Path ptRead = new Path(fileName);
      FSDataInputStream reader = fs.open(ptRead);
      System.out.println("In agg. Getting tweet text from " + fileName + ". reader = " + reader);
      String line = null;
      //System.out.println("Reader created " + reader + ", content = " + reader.readLine());
      while ((line = reader.readLine()) != null) {
        String[] splits = line.split("\t");
        int id = Integer.parseInt(splits[0]);
        //System.out.println("About to write");
        if (selectedTweets.contains(id) && splits.length >= 2) {
          //System.out.println(splits[1]);
          br.write(splits[1]);
          br.newLine();
        }
      }
      reader.close();
      br.close();
    } catch (IOException ex) {
      System.out.println("In aggregator: " + ex.getMessage());
    }
  }
  //vertexes.add(vertexId);
  //neighborList.add(neighbor);
  //variance.add(value.getVar());
  //System.out.println(tweets.size());
}
From source file:org.apache.hama.bsp.CombineFileInputFormat.java
License:Apache License
@Override
public InputSplit[] getSplits(BSPJob bspJob, int numSplits) throws IOException {
  Configuration job = bspJob.getConfiguration();
  long minSizeNode = 0;
  long minSizeRack = 0;
  long maxSize = 0;

  // the values specified by setxxxSplitSize() take precedence over the
  // values that might have been specified in the config
  if (minSplitSizeNode != 0) {
    minSizeNode = minSplitSizeNode;
  } else {
    minSizeNode = job.getLong("mapred.min.split.size.per.node", 0);
  }
  if (minSplitSizeRack != 0) {
    minSizeRack = minSplitSizeRack;
  } else {
    minSizeRack = job.getLong("mapred.min.split.size.per.rack", 0);
  }
  if (maxSplitSize != 0) {
    maxSize = maxSplitSize;
  } else {
    maxSize = job.getLong("mapred.max.split.size", 0);
  }
  if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
    throw new IOException("Minimum split size pernode " + minSizeNode
        + " cannot be larger than maximum split size " + maxSize);
  }
  if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
    throw new IOException("Minimum split size per rack" + minSizeRack
        + " cannot be larger than maximum split size " + maxSize);
  }
  if (minSizeRack != 0 && minSizeNode > minSizeRack) {
    throw new IOException("Minimum split size per node" + minSizeNode
        + " cannot be smaller than minimum split size per rack " + minSizeRack);
  }

  // all the files in input set
  Path[] paths = FileUtil.stat2Paths(listStatus(bspJob));
  List<CombineFileSplit> splits = new ArrayList<CombineFileSplit>();
  if (paths.length == 0) {
    return splits.toArray(new CombineFileSplit[splits.size()]);
  }

  // In one single iteration, process all the paths in a single pool.
  // Processing one pool at a time ensures that a split contains paths
  // from a single pool only.
  for (MultiPathFilter onepool : pools) {
    ArrayList<Path> myPaths = new ArrayList<Path>();

    // pick one input path. If it matches all the filters in a pool,
    // add it to the output set
    for (int i = 0; i < paths.length; i++) {
      if (paths[i] == null) { // already processed
        continue;
      }
      Path p = new Path(paths[i].toUri().getPath());
      if (onepool.accept(p)) {
        myPaths.add(paths[i]); // add it to my output set
        paths[i] = null; // already processed
      }
    }
    // create splits for all files in this pool.
    getMoreSplits(bspJob, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack, splits);
  }

  // Finally, process all paths that do not belong to any pool.
  ArrayList<Path> myPaths = new ArrayList<Path>();
  for (Path path : paths) {
    if (path == null) { // already processed
      continue;
    }
    myPaths.add(path);
  }
  // create splits for all files that are not in any pool.
  getMoreSplits(bspJob, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack, splits);

  // free up rackToNodes map
  rackToNodes.clear();
  return splits.toArray(new CombineFileSplit[splits.size()]);
}
From source file:org.apache.mahout.classifier.sequencelearning.baumwelchmapreduce.BaumWelchUtils.java
License:Apache License
public static HmmModel CreateHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
    Configuration conf) throws IOException {
  log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
  Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
  Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
  Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);

  // Get the path location where the seq files encoding model are stored
  Path modelFilesPath = new Path(modelPath, "*");
  log.info("Create Hmm Model. ModelFiles Path = {}", modelFilesPath.toUri());

  Collection<Path> result = new ArrayList<Path>();

  // get all filtered file names in result list
  FileSystem fs = modelFilesPath.getFileSystem(conf);
  log.info("Create Hmm Model. File System = {}", fs);
  FileStatus[] matches = fs.listStatus(
      FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
      PathFilters.partFilter());
  for (FileStatus match : matches) {
    log.info("CreateHmmmModel Adding File Match {}", match.getPath().toString());
    result.add(fs.makeQualified(match.getPath()));
  }

  // iterate through the result path list
  for (Path path : result) {
    for (Pair<Writable, MapWritable> pair : new SequenceFileIterable<Writable, MapWritable>(path, true, conf)) {
      Text key = (Text) pair.getFirst();
      log.info("CreateHmmModel Matching Seq File Key = {}", key);
      MapWritable valueMap = pair.getSecond();
      if (key.charAt(0) == 'I') {
        // initial distribution stripe
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          log.info("CreateHmmModel Initial Prob Adding Key, Value = ({} {})",
              ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get());
          initialProbabilities.set(((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == 'T') {
        // transition distribution stripe
        // key is of the form TRANSIT_0, TRANSIT_1 etc.;
        // the state ID is the digit after the underscore (character index 8)
        int stateID = Character.getNumericValue(key.charAt(8));
        log.info("CreateHmmModel stateID = key.charAt(8) = {}", stateID);
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          log.info("CreateHmmModel Transition Matrix ({}, {}) = {}",
              new Object[] { stateID, ((IntWritable) entry.getKey()).get(),
                  ((DoubleWritable) entry.getValue()).get() });
          transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == 'E') {
        // emission distribution stripe
        // key is of the form EMIT_0, EMIT_1 etc.;
        // the state ID is the digit after the underscore (character index 5)
        int stateID = Character.getNumericValue(key.charAt(5));
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          log.info("CreateHmmModel Emission Matrix ({}, {}) = {}",
              new Object[] { stateID, ((IntWritable) entry.getKey()).get(),
                  ((DoubleWritable) entry.getValue()).get() });
          emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else {
        throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
      }
    }
  }
  HmmModel model = new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
  HmmUtils.validate(model);
  return model;
}