List of usage examples for com.google.common.collect.Ordering#natural()
@GwtCompatible(serializable = true) @SuppressWarnings("unchecked") public static <C extends Comparable> Ordering<C> natural()
From source file:com.evidon.areweprivateyet.ValueComparableMap.java
public static void main(String[] args) { TreeMap<String, Integer> map = new ValueComparableMap<String, Integer>(Ordering.natural()); map.put("a", 5); map.put("b", 1); map.put("c", 3); assertEquals("b", map.firstKey()); assertEquals("a", map.lastKey()); map.put("d", 0); assertEquals("d", map.firstKey()); //ensure it's still a map (by overwriting a key, but with a new value) map.put("d", 2); assertEquals("b", map.firstKey()); //Ensure multiple values do not clobber keys map.put("e", 2); assertEquals(5, map.size());/* w w w . j a va 2 s . c om*/ assertEquals(2, (int) map.get("e")); assertEquals(2, (int) map.get("d")); }
From source file:RandomWalk.java
public static void main(String[] args) { int numNodes = 3; if (args.length > 0) numNodes = Integer.parseInt(args[0]); System.out.printf("Generating a random graph with %d nodes...\n", numNodes); DirectedGraph graph = TestGraphs.generateRandomGraph(numNodes, Math.min(10, numNodes)); System.out.printf("Generated a random directed graph with %s nodes and %s edges.\n", graph.nodeCount(), graph.edgeCount());/* w ww . ja v a2 s .c o m*/ // Generate walk parameters long numSteps = 1000 * 1000; final scala.Option<Integer> wpNone = scala.Option.apply(null); final scala.Option<Integer> wpTwo = scala.Option.apply(2); RandomWalkParams walkParams = new RandomWalkParams(numSteps, 0.1, wpNone, wpTwo, wpNone, false, GraphDir.OutDir(), false, false); GraphUtils graphUtils = new GraphUtils(graph); // Do the walk and measure how long it took System.out.printf("Now doing a random walk of %s steps from Node 0...\n", numSteps); long startTime = System.nanoTime(); Tuple2<Int2IntMap, scala.Option<Int2ObjectMap<Object2IntMap<DirectedPath>>>> lm = graphUtils .calculatePersonalizedReputation(0, walkParams); long endTime = System.nanoTime(); Int2IntMap neighbors = lm._1; System.out.printf("Random walk visited %s nodes in %s ms:\n", neighbors.size(), (endTime - startTime) / 1000000); // Sort neighbors (or nodes) in descending number of visits and take the top 10 neighbors List<Integer> topNeighbors = Ordering.natural().onResultOf(Functions.forMap(neighbors)).reverse() .immutableSortedCopy(neighbors.keySet()); if (topNeighbors.size() > 10) topNeighbors = topNeighbors.subList(0, 10); // Print the top 10 neighbors (and paths) System.out.printf("%8s%10s\t%s\n", "NodeID", "#Visits", "Top 2 Paths with counts"); for (int id : topNeighbors) { int numVisits = neighbors.get(id); System.out.printf("%8s%10s\t", id, numVisits); if (lm._2.isDefined()) { // If Option is not None Object2IntMap<DirectedPath> paths = lm._2.get().get(id); int remaining = paths.size(); for (Map.Entry<DirectedPath, Integer> ef : 
paths.entrySet()) { // Print a directed path and #visits along that path int[] nodes = ef.getKey().nodes(); for (int i = 0; i < nodes.length; i++) { if (i != 0) System.out.printf("->%d", nodes[i]); else System.out.printf("%d", nodes[i]); } System.out.printf(" (%d)", ef.getValue()); if (remaining > 1) System.out.printf(" | "); remaining--; } } System.out.println(); } }
From source file:RandomWalkJava.java
public static void main(String[] args) { int numNodes = 3; if (args.length > 0) numNodes = Integer.parseInt(args[0]); System.out.printf("Generating a random graph with %d nodes...\n", numNodes); DirectedGraph graph = TestGraphs.generateRandomGraph(numNodes, TestGraphs.getProbEdgeRandomDirected(numNodes, Math.min(10, numNodes)), StoredGraphDir.BothInOut()); System.out.printf("Generated a random directed graph with %s nodes and %s edges.\n", graph.nodeCount(), graph.edgeCount());/* ww w .j a v a 2s .co m*/ // Generate walk parameters long numSteps = 1000 * 1000; final scala.Option<Object> wpNone = scala.Option.apply(null); final scala.Option<Object> wpTwo = scala.Option.apply((Object) 2); RandomWalkParams walkParams = new RandomWalkParams(numSteps, 0.1, wpNone, wpTwo, wpNone, false, GraphDir.OutDir(), false, false); GraphUtils graphUtils = new GraphUtils(graph); // Do the walk and measure how long it took System.out.printf("Now doing a random walk of %s steps from Node 0...\n", numSteps); long startTime = System.nanoTime(); Tuple2<Int2IntMap, scala.Option<Int2ObjectMap<Object2IntMap<DirectedPath>>>> lm = graphUtils .calculatePersonalizedReputation(0, walkParams); long endTime = System.nanoTime(); Int2IntMap neighbors = lm._1; System.out.printf("Random walk visited %s nodes in %s ms:\n", neighbors.size(), (endTime - startTime) / 1000000); // Sort neighbors (or nodes) in descending number of visits and take the top 10 neighbors List<Integer> topNeighbors = Ordering.natural().onResultOf(Functions.forMap(neighbors)).reverse() .immutableSortedCopy(neighbors.keySet()); if (topNeighbors.size() > 10) topNeighbors = topNeighbors.subList(0, 10); // Print the top 10 neighbors (and paths) System.out.printf("%8s%10s\t%s\n", "NodeID", "#Visits", "Top 2 Paths with counts"); for (int id : topNeighbors) { int numVisits = neighbors.get(id); System.out.printf("%8s%10s\t", id, numVisits); if (lm._2.isDefined()) { // If Option is not None Object2IntMap<DirectedPath> paths = 
lm._2.get().get(id); int remaining = paths.size(); for (Map.Entry<DirectedPath, Integer> ef : paths.entrySet()) { // Print a directed path and #visits along that path int[] nodes = ef.getKey().nodes(); for (int i = 0; i < nodes.length; i++) { if (i != 0) System.out.printf("->%d", nodes[i]); else System.out.printf("%d", nodes[i]); } System.out.printf(" (%d)", ef.getValue()); if (remaining > 1) System.out.printf(" | "); remaining--; } } System.out.println(); } }
From source file:org.talos.CFGScanDroid.CFGScanDroid.java
public static void main(String[] args) throws IOException { parsedArguments = new JCommanderArguments(); JCommander argParser = new JCommander(parsedArguments); // parse arguments try {/*from w w w . j ava 2 s. c om*/ argParser.parse(args); } catch (ParameterException exception) { System.err.println(exception); System.err.println("PARSE ERROR: Bad parameter"); System.out.print(parsedArguments.getUsage()); System.exit(1); } // make sure a useful set of arguments are set validateArguments(argParser); // get files from directories, one level deep List<File> fileList = getFileList(); fileList = Ordering.natural().sortedCopy(fileList); // dump sigs if (parsedArguments.dumpSignatures()) { for (File file : fileList) dumpSigs(file); // scan } else { // load signatures List<CFGSig> signatures = null; for (String sigFile : parsedArguments.getSignatureFiles()) { if (signatures == null) signatures = parseSignatures(sigFile); else signatures.addAll(parseSignatures(sigFile)); } // load raw signatures for (String sig : parsedArguments.getRawSignatures()) { if (signatures == null) signatures = new ArrayList<CFGSig>(); CFGSig cfgSig = new CFGSig(sig); signatures.add(cfgSig); } // normalize if (parsedArguments.normalize()) { for (CFGSig cfgSig : signatures) { // System.out.println("NORMALIZING SIGNATURE: " + cfgSig.getName()); // System.out.println(cfgSig.getVertexCount()); // System.out.println(cfgSig.getEdgeCount()); cfgSig.normalize(); // System.out.println(cfgSig.getVertexCount()); // System.out.println(cfgSig.getEdgeCount()); } } // for each file, scan for (File file : fileList) { ++scannedSampleCount; boolean detected = scanDexFile(file, signatures); if (detected) ++detectedSampleCount; } // print stats if (parsedArguments.printStatistics()) { System.out.println(); System.out.println("Samples Scanned:\t" + scannedSampleCount); System.out.println("Functions Scanned:\t" + scannedFunctionCount); System.out.println("Samples Detected:\t" + detectedSampleCount); for (CFGSig 
signature : signatures) { System.out.println(signature.getName() + ": " + signature.getDetectionCount()); } } if (parsedArguments.outputGraph()) { Graph graph = buildGraph(); graph.shutdown(); } } return; }
From source file:org.apache.mahout.classifier.sgd.TrainNewsGroups.java
public static void main(String[] args) throws IOException { File base = new File(args[0]); Multiset<String> overallCounts = HashMultiset.create(); int leakType = 0; if (args.length > 1) { leakType = Integer.parseInt(args[1]); }// ww w . j a v a 2s . c o m Dictionary newsGroups = new Dictionary(); NewsgroupHelper helper = new NewsgroupHelper(); helper.getEncoder().setProbes(2); AdaptiveLogisticRegression learningAlgorithm = new AdaptiveLogisticRegression(20, NewsgroupHelper.FEATURES, new L1()); learningAlgorithm.setInterval(800); learningAlgorithm.setAveragingWindow(500); List<File> files = Lists.newArrayList(); for (File newsgroup : base.listFiles()) { if (newsgroup.isDirectory()) { newsGroups.intern(newsgroup.getName()); files.addAll(Arrays.asList(newsgroup.listFiles())); } } Collections.shuffle(files); System.out.println(files.size() + " training files"); SGDInfo info = new SGDInfo(); int k = 0; for (File file : files) { String ng = file.getParentFile().getName(); int actual = newsGroups.intern(ng); Vector v = helper.encodeFeatureVector(file, actual, leakType, overallCounts); learningAlgorithm.train(actual, v); k++; State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest(); SGDHelper.analyzeState(info, leakType, k, best); } learningAlgorithm.close(); SGDHelper.dissect(leakType, newsGroups, learningAlgorithm, files, overallCounts); System.out.println("exiting main"); ModelSerializer.writeBinary("/tmp/news-group.model", learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0)); List<Integer> counts = Lists.newArrayList(); System.out.println("Word counts"); for (String count : overallCounts.elementSet()) { counts.add(overallCounts.count(count)); } Collections.sort(counts, Ordering.natural().reverse()); k = 0; for (Integer count : counts) { System.out.println(k + "\t" + count); k++; if (k > 1000) { break; } } }
From source file:com.memonews.mahout.sentiment.SentimentModelTrainer.java
/**
 * Trains a two-class sentiment logistic-regression model from the directory
 * tree rooted at args[0]; each subdirectory is one class label. The trained
 * model is written to args[1] (default "target/model") and the corpus
 * word-count distribution is printed (top 1000 counts, descending).
 */
public static void main(final String[] args) throws IOException {
    final File base = new File(args[0]);
    final String modelPath = args.length > 1 ? args[1] : "target/model";
    final Multiset<String> overallCounts = HashMultiset.create();

    final Dictionary newsGroups = new Dictionary();
    final SentimentModelHelper helper = new SentimentModelHelper();
    helper.getEncoder().setProbes(2);

    final AdaptiveLogisticRegression learningAlgorithm = new AdaptiveLogisticRegression(2,
            SentimentModelHelper.FEATURES, new L1());
    learningAlgorithm.setInterval(800);
    learningAlgorithm.setAveragingWindow(500);

    // Collect training files from each class directory, interning labels.
    final List<File> files = Lists.newArrayList();
    for (final File newsgroup : base.listFiles()) {
        if (newsgroup.isDirectory()) {
            newsGroups.intern(newsgroup.getName());
            files.addAll(Arrays.asList(newsgroup.listFiles()));
        }
    }
    // Shuffle so the on-line learner sees the classes in random order.
    Collections.shuffle(files);
    System.out.printf("%d training files\n", files.size());

    // On-line training pass: one example per file, labeled by its directory.
    final SGDInfo info = new SGDInfo();
    int k = 0;
    for (final File file : files) {
        final String ng = file.getParentFile().getName();
        final int actual = newsGroups.intern(ng);
        final Vector v = helper.encodeFeatureVector(file, overallCounts);
        learningAlgorithm.train(actual, v);
        k++;
        final State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest();
        SGDHelper.analyzeState(info, 0, k, best);
    }
    learningAlgorithm.close();
    SGDHelper.dissect(0, newsGroups, learningAlgorithm, files, overallCounts);
    System.out.println("exiting main");

    ModelSerializer.writeBinary(modelPath,
            learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0));

    // Report the corpus word-count distribution, largest counts first.
    final List<Integer> counts = Lists.newArrayList();
    System.out.printf("Word counts\n");
    for (final String count : overallCounts.elementSet()) {
        counts.add(overallCounts.count(count));
    }
    Collections.sort(counts, Ordering.natural().reverse());
    k = 0;
    for (final Integer count : counts) {
        System.out.printf("%d\t%d\n", k, count);
        k++;
        if (k > 1000) {
            break;
        }
    }
}
From source file:org.apache.ctakes.temporal.data.analysis.PrintInconsistentAnnotations.java
public static void main(String[] args) throws Exception { Options options = CliFactory.parseArguments(Options.class, args); int windowSize = 50; List<Integer> patientSets = options.getPatients().getList(); List<Integer> trainItems = THYMEData.getPatientSets(patientSets, THYMEData.TRAIN_REMAINDERS); List<File> files = THYMEData.getFilesFor(trainItems, options.getRawTextDirectory()); CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(files); AggregateBuilder aggregateBuilder = new AggregateBuilder(); aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription()); aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(XMIReader.class, XMIReader.PARAM_XMI_DIRECTORY, options.getXMIDirectory())); int totalDocTimeRels = 0; int totalInconsistentDocTimeRels = 0; for (Iterator<JCas> casIter = new JCasIterator(reader, aggregateBuilder.createAggregate()); casIter .hasNext();) {// w w w . j a v a 2s . c om JCas jCas = casIter.next(); String text = jCas.getDocumentText(); JCas goldView = jCas.getView("GoldView"); // group events by their narrative container Multimap<Annotation, EventMention> containers = HashMultimap.create(); for (TemporalTextRelation relation : JCasUtil.select(goldView, TemporalTextRelation.class)) { if (relation.getCategory().equals("CONTAINS")) { Annotation arg1 = relation.getArg1().getArgument(); Annotation arg2 = relation.getArg2().getArgument(); if (arg2 instanceof EventMention) { EventMention event = (EventMention) arg2; containers.put(arg1, event); } } } // check each container for inconsistent DocTimeRels for (Annotation container : containers.keySet()) { Set<String> docTimeRels = Sets.newHashSet(); for (EventMention event : containers.get(container)) { docTimeRels.add(event.getEvent().getProperties().getDocTimeRel()); } totalDocTimeRels += docTimeRels.size(); boolean inconsistentDocTimeRels; if (container instanceof EventMention) { EventMention mention = ((EventMention) container); String containerDocTimeRel = 
mention.getEvent().getProperties().getDocTimeRel(); inconsistentDocTimeRels = false; for (String docTimeRel : docTimeRels) { if (docTimeRel.equals(containerDocTimeRel)) { continue; } if (containerDocTimeRel.equals("BEFORE/OVERLAP") && (docTimeRel.equals("BEFORE") || docTimeRel.equals("OVERLAP"))) { continue; } inconsistentDocTimeRels = true; break; } } else { if (docTimeRels.size() == 1) { inconsistentDocTimeRels = false; } else if (docTimeRels.contains("BEFORE/OVERLAP")) { inconsistentDocTimeRels = docTimeRels.size() == 1 && (docTimeRels.contains("BEFORE") || docTimeRels.contains("OVERLAP")); } else { inconsistentDocTimeRels = true; } } // if inconsistent: print events, DocTimeRels and surrounding context if (inconsistentDocTimeRels) { totalInconsistentDocTimeRels += docTimeRels.size(); List<Integer> offsets = Lists.newArrayList(); offsets.add(container.getBegin()); offsets.add(container.getEnd()); for (EventMention event : containers.get(container)) { offsets.add(event.getBegin()); offsets.add(event.getEnd()); } Collections.sort(offsets); int begin = Math.max(offsets.get(0) - windowSize, 0); int end = Math.min(offsets.get(offsets.size() - 1) + windowSize, text.length()); System.err.printf("Inconsistent DocTimeRels in %s, ...%s...\n", new File(ViewUriUtil.getURI(jCas)).getName(), text.substring(begin, end).replaceAll("([\r\n])[\r\n]+", "$1")); if (container instanceof EventMention) { System.err.printf("Container: \"%s\" (docTimeRel=%s)\n", container.getCoveredText(), ((EventMention) container).getEvent().getProperties().getDocTimeRel()); } else { System.err.printf("Container: \"%s\"\n", container.getCoveredText()); } Ordering<EventMention> byBegin = Ordering.natural() .onResultOf(new Function<EventMention, Integer>() { @Override public Integer apply(@Nullable EventMention event) { return event.getBegin(); } }); for (EventMention event : byBegin.sortedCopy(containers.get(container))) { System.err.printf("* \"%s\" (docTimeRel=%s)\n", event.getCoveredText(), 
event.getEvent().getProperties().getDocTimeRel()); } System.err.println(); } } } System.err.printf("Inconsistent DocTimeRels: %.1f%% (%d/%d)\n", 100.0 * totalInconsistentDocTimeRels / totalDocTimeRels, totalInconsistentDocTimeRels, totalDocTimeRels); }
From source file:org.apache.ctakes.temporal.eval.EvaluationOfTimeSpans.java
public static void main(String[] args) throws Exception { Options options = CliFactory.parseArguments(Options.class, args); List<Integer> trainItems = null; List<Integer> devItems = null; List<Integer> testItems = null; List<Integer> patientSets = options.getPatients().getList(); if (options.getXMLFormat() == XMLFormat.I2B2) { trainItems = I2B2Data.getTrainPatientSets(options.getXMLDirectory()); devItems = I2B2Data.getDevPatientSets(options.getXMLDirectory()); testItems = I2B2Data.getTestPatientSets(options.getXMLDirectory()); } else {// w w w . j av a2s. c om trainItems = THYMEData.getPatientSets(patientSets, options.getTrainRemainders().getList()); devItems = THYMEData.getPatientSets(patientSets, options.getDevRemainders().getList()); testItems = THYMEData.getPatientSets(patientSets, options.getTestRemainders().getList()); } List<Integer> allTrain = new ArrayList<>(trainItems); List<Integer> allTest = null; if (options.getTest()) { allTrain.addAll(devItems); allTest = new ArrayList<>(testItems); } else { allTest = new ArrayList<>(devItems); } // specify the annotator classes to use List<Class<? extends JCasAnnotator_ImplBase>> annotatorClasses = Lists.newArrayList(); if (options.getRunBackwards()) annotatorClasses.add(BackwardsTimeAnnotator.class); if (options.getRunForwards()) annotatorClasses.add(TimeAnnotator.class); if (options.getRunParserBased()) annotatorClasses.add(ConstituencyBasedTimeAnnotator.class); if (options.getRunCrfBased()) annotatorClasses.add(CRFTimeAnnotator.class); if (annotatorClasses.size() == 0) { // run all annotatorClasses.add(BackwardsTimeAnnotator.class); annotatorClasses.add(TimeAnnotator.class); annotatorClasses.add(ConstituencyBasedTimeAnnotator.class); annotatorClasses.add(CRFTimeAnnotator.class); } Map<Class<? 
extends JCasAnnotator_ImplBase>, String[]> annotatorTrainingArguments = Maps.newHashMap(); // THYME best params: Backwards: 0.1, CRF 0.3, Time 0.1, Constituency 0.3 // i2b2 best params: Backwards 0.1, CRF 3.0, Time 0.1, Constituency 0.3 // String gridParam = "0.01"; annotatorTrainingArguments.put(BackwardsTimeAnnotator.class, new String[] { "-c", "0.1" }); annotatorTrainingArguments.put(TimeAnnotator.class, new String[] { "-c", "0.1" }); annotatorTrainingArguments.put(ConstituencyBasedTimeAnnotator.class, new String[] { "-c", "0.3" }); annotatorTrainingArguments.put(CRFTimeAnnotator.class, new String[] { "-p", "c2=" + "0.3" }); // run one evaluation per annotator class final Map<Class<?>, AnnotationStatistics<?>> annotatorStats = Maps.newHashMap(); for (Class<? extends JCasAnnotator_ImplBase> annotatorClass : annotatorClasses) { EvaluationOfTimeSpans evaluation = new EvaluationOfTimeSpans(new File("target/eval/time-spans"), options.getRawTextDirectory(), options.getXMLDirectory(), options.getXMLFormat(), options.getSubcorpus(), options.getXMIDirectory(), options.getTreebankDirectory(), options.getFeatureSelectionThreshold(), options.getSMOTENeighborNumber(), annotatorClass, options.getPrintOverlappingSpans(), annotatorTrainingArguments.get(annotatorClass)); evaluation.prepareXMIsFor(patientSets); evaluation.setSkipTrain(options.getSkipTrain()); evaluation.printErrors = options.getPrintErrors(); if (options.getI2B2Output() != null) evaluation.setI2B2Output(options.getI2B2Output() + "/" + annotatorClass.getSimpleName()); String name = String.format("%s.errors", annotatorClass.getSimpleName()); evaluation.setLogging(Level.FINE, new File("target/eval", name)); AnnotationStatistics<String> stats = evaluation.trainAndTest(allTrain, allTest); annotatorStats.put(annotatorClass, stats); } // allow ordering of models by F1 Ordering<Class<? extends JCasAnnotator_ImplBase>> byF1 = Ordering.natural() .onResultOf(new Function<Class<? 
extends JCasAnnotator_ImplBase>, Double>() { @Override public Double apply(Class<? extends JCasAnnotator_ImplBase> annotatorClass) { return annotatorStats.get(annotatorClass).f1(); } }); // print out models, ordered by F1 for (Class<?> annotatorClass : byF1.sortedCopy(annotatorClasses)) { System.err.printf("===== %s =====\n", annotatorClass.getSimpleName()); System.err.println(annotatorStats.get(annotatorClass)); } }
From source file:com.yahoo.spaclu.data.index.IndexFeatureValueSpark.java
public static void main(String[] args) throws IOException { IndexFeatureValueOptions optionsFormatRawToDatabase = new IndexFeatureValueOptions(args); String inputPathString = optionsFormatRawToDatabase.getInputPath(); String outputPathString = optionsFormatRawToDatabase.getOutputPath(); String indexPathString = optionsFormatRawToDatabase.getIndexPath(); int numberOfPartitions = optionsFormatRawToDatabase.getNumberOfPartitions(); int maxCutoffThreshold = optionsFormatRawToDatabase.getMaximumCutoffThreshold(); int minCutoffThreshold = optionsFormatRawToDatabase.getMinimumCutoffThreshold(); /*/*from w ww . ja va 2 s . c om*/ * Set<String> excludingFeatureNames = new HashSet<String>(); * excludingFeatureNames.add("login"); * excludingFeatureNames.add("time"); excludingFeatureNames.add("day"); * excludingFeatureNames.add("hms"); excludingFeatureNames.add("fail"); */ sLogger.info("Tool: " + IndexFeatureValueSpark.class.getSimpleName()); sLogger.info(" - input path: " + inputPathString); sLogger.info(" - output path: " + outputPathString); sLogger.info(" - index path: " + indexPathString); sLogger.info(" - number of partitions: " + numberOfPartitions); sLogger.info(" - maximum cutoff: " + maxCutoffThreshold); sLogger.info(" - minimum cutoff: " + minCutoffThreshold); // Create a default hadoop configuration Configuration conf = new Configuration(); // Parse created config to the HDFS FileSystem fs = FileSystem.get(conf); Path outputPath = new Path(outputPathString); if (fs.exists(outputPath)) { fs.delete(outputPath, true); } SparkConf sparkConf = new SparkConf().setAppName(optionsFormatRawToDatabase.toString()); JavaSparkContext sc = new JavaSparkContext(sparkConf); Map<Integer, String> featureIndices = getFeatureIndices(sc.textFile(indexPathString)); List<Integer> listOfAllFeatureIndices = new LinkedList<Integer>(); List<String> listOfAllFeatureInfo = new LinkedList<String>(); Iterator<Integer> indexIter = featureIndices.keySet().iterator(); while (indexIter.hasNext()) { 
Integer tempKey = indexIter.next(); listOfAllFeatureIndices.add(tempKey); listOfAllFeatureInfo.add(featureIndices.get(tempKey)); } /* * * * * * * * */ JavaRDD<String> rawLines = sc.textFile(inputPathString).repartition(numberOfPartitions); JavaRDD<String[]> tokenizedLines = rawLines.map(new LineFilter(listOfAllFeatureIndices)); JavaPairRDD<Entry<Integer, String>, Long> featureValuesCounts = tokenizedLines .flatMapToPair(new FeatureValueMapper()).reduceByKey(new FeatureValueReducer()); Map<Integer, Builder<String, Long>> featureValueMapping = new Hashtable<Integer, Builder<String, Long>>(); Iterator<Tuple2<Entry<Integer, String>, Long>> iter = featureValuesCounts.collect().iterator(); while (iter.hasNext()) { Tuple2<Entry<Integer, String>, Long> temp = iter.next(); Entry<Integer, String> featureValueEntry = temp._1; int featureIndex = featureValueEntry.getKey(); String featureValue = featureValueEntry.getValue(); long featureValueCount = temp._2; if (!featureValueMapping.containsKey(featureIndex)) { Builder<String, Long> mapBuilder = new Builder<String, Long>(Ordering.natural()); featureValueMapping.put(featureIndex, mapBuilder); } featureValueMapping.get(featureIndex).put(featureValue, featureValueCount); } Preconditions.checkArgument(featureValueMapping.size() == listOfAllFeatureIndices.size()); String outputFeaturePathString = outputPathString + "feature" + Settings.SEPERATOR; fs.mkdirs(new Path(outputFeaturePathString)); String outputFeatureNamePathString = outputPathString + "feature.dat"; Path outputFeatureNamePath = new Path(outputFeatureNamePathString); PrintWriter featureNamePrinterWriter = new PrintWriter(fs.create(outputFeatureNamePath), true); List<Integer> listOfFeatureIndicesToKeep = new LinkedList<Integer>(); Map<Integer, Map<String, Integer>> featureValueIndex = new Hashtable<Integer, Map<String, Integer>>(); for (int d = 0; d < featureValueMapping.size(); d++) { Map<String, Integer> valueToIndex = new Hashtable<String, Integer>(); Map<Integer, 
String> indexToValue = new Hashtable<Integer, String>(); ImmutableSortedMap<String, Long> immutableSortedMap = featureValueMapping.get(d).build(); for (String keyString : immutableSortedMap.keySet()) { valueToIndex.put(keyString, valueToIndex.size()); indexToValue.put(indexToValue.size(), keyString); } if (valueToIndex.size() <= minCutoffThreshold || valueToIndex.size() > maxCutoffThreshold) { sLogger.info("Feature (" + listOfAllFeatureInfo.get(d) + ") contains " + valueToIndex.size() + " values, skip..."); continue; } else { sLogger.info("Feature (" + listOfAllFeatureInfo.get(d) + ") contains " + valueToIndex.size() + " values."); listOfFeatureIndicesToKeep.add(listOfAllFeatureIndices.get(d)); featureNamePrinterWriter.println(listOfAllFeatureInfo.get(d)); } String outputFeatureIndexPathString = outputFeaturePathString + "index" + Settings.UNDER_SCORE + featureValueIndex.size() + ".dat"; Path outputIndexPath = new Path(outputFeatureIndexPathString); featureValueIndex.put(featureValueIndex.size(), valueToIndex); PrintWriter featureValueIndexPrinterWriter = new PrintWriter(fs.create(outputIndexPath), true); for (int i = 0; i < indexToValue.size(); i++) { featureValueIndexPrinterWriter.println("" + i + Settings.TAB + indexToValue.get(i) + Settings.TAB + immutableSortedMap.get(indexToValue.get(i))); } featureValueIndexPrinterWriter.close(); } featureNamePrinterWriter.close(); JavaRDD<String[]> filteredLines = rawLines.map(new LineFilter(listOfFeatureIndicesToKeep)); JavaRDD<FeatureIntegerVector> indexedData = filteredLines.map(new FeatureValueIndexer(featureValueIndex)); String outputDataPathString = outputPathString + "data"; Path outputDataPath = new Path(outputDataPathString); if (fs.exists(outputDataPath)) { fs.delete(outputDataPath, true); } indexedData.saveAsTextFile(outputDataPathString); sc.stop(); }
From source file:org.caleydo.view.domino.internal.util.IndexedSort.java
/**
 * Returns the index permutation that sorts {@code list} by its elements'
 * natural ordering, delegating to the comparator-based overload with
 * Guava's {@code Ordering.natural()}.
 */
public static <T extends Comparable<T>> int[] sortIndex(List<T> list) {
    final Ordering<T> naturalOrder = Ordering.natural();
    return sortIndex(list, naturalOrder);
}