List of usage examples for org.apache.commons.io Charsets UTF_8
Charset UTF_8
To view the source code for org.apache.commons.io Charsets UTF_8, click the Source Link.
Eight-bit Unicode Transformation Format.
From source file:org.apache.mahout.utils.email.MailProcessorTest.java
@Test public void testStripQuoted() throws Exception { StringWriter writer = new StringWriter(); MailOptions options = new MailOptions(); options.setSeparator(":::"); options.setCharset(Charsets.UTF_8); options.setPatternsToMatch(new Pattern[] { MailProcessor.SUBJECT_PREFIX }); options.setInput(new File(System.getProperty("user.dir"))); options.setIncludeBody(true);//from www. j a va 2 s . c o m MailProcessor proc = new MailProcessor(options, "", writer); URL url = MailProcessorTest.class.getClassLoader().getResource("test.mbox"); File file = new File(url.toURI()); long count = proc.parseMboxLineByLine(file); assertEquals(7, count); assertTrue(writer.getBuffer().toString().contains("> Cocoon Cron Block Configurable Clustering")); writer = new StringWriter(); proc = new MailProcessor(options, "", writer); options.setStripQuotedText(true); count = proc.parseMboxLineByLine(file); assertEquals(7, count); assertFalse(writer.getBuffer().toString().contains("> Cocoon Cron Block Configurable Clustering")); }
From source file:org.apache.mahout.utils.MatrixDumper.java
private static PrintStream getPrintStream(String outputPath) throws IOException { if (outputPath == null) { return System.out; }// w w w .j a v a 2 s . c o m File outputFile = new File(outputPath); if (outputFile.exists()) { outputFile.delete(); } outputFile.createNewFile(); OutputStream os = new FileOutputStream(outputFile); return new PrintStream(os, false, Charsets.UTF_8.displayName()); }
From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilter.java
/**
 * Wraps a token stream so that its tokens can be checked against a bloom filter.
 *
 * @param filter tokens will be checked for membership in this bloom filter
 * @param keepMembers keep members of the bloom filter? If true works like
 *   a whitelist and members found in the list are kept and all others are
 *   dropped. If false works like a stoplist and members found in the
 *   filter are dropped all others are kept.
 * @param in the tokenstream to read.
 */
public BloomTokenFilter(Filter filter, boolean keepMembers, TokenStream in) {
  super(in);

  // Membership configuration supplied by the caller.
  this.keepMembers = keepMembers;
  this.filter = filter;

  // Per-instance working state.
  this.termAtt = addAttribute(CharTermAttribute.class);
  this.key = new Key();

  // UTF-8 encoder that reports malformed or unmappable input instead of
  // silently replacing it. Both configuration calls return the encoder itself.
  this.encoder = Charsets.UTF_8.newEncoder();
  this.encoder.onMalformedInput(CodingErrorAction.REPORT);
  this.encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
}
From source file:org.apache.mahout.utils.SequenceFileDumper.java
@Override public int run(String[] args) throws Exception { addInputOption();// ww w . j a va2 s . co m addOutputOption(); addOption("substring", "b", "The number of chars to print out per value", false); addOption(buildOption("count", "c", "Report the count only", false, false, null)); addOption("numItems", "n", "Output at most <n> key value pairs", false); addOption( buildOption("facets", "fa", "Output the counts per key. Note, if there are a lot of unique keys, " + "this can take up a fair amount of memory", false, false, null)); addOption(buildOption("quiet", "q", "Print only file contents.", false, false, null)); if (parseArguments(args, false, true) == null) { return -1; } Path[] pathArr; Configuration conf = new Configuration(); Path input = getInputPath(); FileSystem fs = input.getFileSystem(conf); if (fs.getFileStatus(input).isDir()) { pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter())); } else { pathArr = new Path[1]; pathArr[0] = input; } Writer writer; boolean shouldClose; if (hasOption("output")) { shouldClose = true; writer = Files.newWriter(new File(getOption("output")), Charsets.UTF_8); } else { shouldClose = false; writer = new OutputStreamWriter(System.out, Charsets.UTF_8); } try { for (Path path : pathArr) { if (!hasOption("quiet")) { writer.append("Input Path: ").append(String.valueOf(path)).append('\n'); } int sub = Integer.MAX_VALUE; if (hasOption("substring")) { sub = Integer.parseInt(getOption("substring")); } boolean countOnly = hasOption("count"); SequenceFileIterator<?, ?> iterator = new SequenceFileIterator<>(path, true, conf); if (!hasOption("quiet")) { writer.append("Key class: ").append(iterator.getKeyClass().toString()); writer.append(" Value Class: ").append(iterator.getValueClass().toString()).append('\n'); } OpenObjectIntHashMap<String> facets = null; if (hasOption("facets")) { facets = new OpenObjectIntHashMap<>(); } long count = 0; if (countOnly) { while (iterator.hasNext()) { Pair<?, ?> record = 
iterator.next(); String key = record.getFirst().toString(); if (facets != null) { facets.adjustOrPutValue(key, 1, 1); //either insert or add 1 } count++; } writer.append("Count: ").append(String.valueOf(count)).append('\n'); } else { long numItems = Long.MAX_VALUE; if (hasOption("numItems")) { numItems = Long.parseLong(getOption("numItems")); if (!hasOption("quiet")) { writer.append("Max Items to dump: ").append(String.valueOf(numItems)).append("\n"); } } while (iterator.hasNext() && count < numItems) { Pair<?, ?> record = iterator.next(); String key = record.getFirst().toString(); writer.append("Key: ").append(key); String str = record.getSecond().toString(); writer.append(": Value: ").append(str.length() > sub ? str.substring(0, sub) : str); writer.write('\n'); if (facets != null) { facets.adjustOrPutValue(key, 1, 1); //either insert or add 1 } count++; } if (!hasOption("quiet")) { writer.append("Count: ").append(String.valueOf(count)).append('\n'); } } if (facets != null) { List<String> keyList = new ArrayList<>(facets.size()); IntArrayList valueList = new IntArrayList(facets.size()); facets.pairsSortedByKey(keyList, valueList); writer.append("-----Facets---\n"); writer.append("Key\t\tCount\n"); int i = 0; for (String key : keyList) { writer.append(key).append("\t\t").append(String.valueOf(valueList.get(i++))).append('\n'); } } } writer.flush(); } finally { if (shouldClose) { Closeables.close(writer, false); } } return 0; }
From source file:org.apache.mahout.utils.SplitInputTest.java
@Override @Before//from w w w. jav a 2 s . co m public void setUp() throws Exception { Configuration conf = getConfiguration(); fs = FileSystem.get(conf); super.setUp(); countMap = new OpenObjectIntHashMap<>(); charset = Charsets.UTF_8; tempSequenceDirectory = getTestTempFilePath("tmpsequence"); tempInputFile = getTestTempFilePath("bayesinputfile"); tempTrainingDirectory = getTestTempDirPath("bayestrain"); tempTestDirectory = getTestTempDirPath("bayestest"); tempMapRedOutputDirectory = new Path(getTestTempDirPath(), "mapRedOutput"); tempInputDirectory = getTestTempDirPath("bayesinputdir"); si = new SplitInput(); si.setTrainingOutputDirectory(tempTrainingDirectory); si.setTestOutputDirectory(tempTestDirectory); si.setInputDirectory(tempInputDirectory); }
From source file:org.apache.mahout.utils.SplitInputTest.java
/**
 * Writes {@code ClassifierData.DATA} into one UTF-8 file per label under
 * {@code tempInputDirectory}, one {@code label<TAB>text} line per entry, and
 * counts every entry in {@code countMap}. A new file is started whenever the
 * label differs from that of the previous entry.
 */
private void writeMultipleInputFiles() throws IOException {
  Writer labelWriter = null;
  String activeLabel = null;
  try {
    for (String[] row : ClassifierData.DATA) {
      boolean sameLabel = row[0].equals(activeLabel);
      if (!sameLabel) {
        // Label changed: finish the previous file (if any) and open the next one.
        activeLabel = row[0];
        Closeables.close(labelWriter, false);
        Path labelFile = new Path(tempInputDirectory, activeLabel);
        labelWriter = new BufferedWriter(new OutputStreamWriter(fs.create(labelFile), Charsets.UTF_8));
      }
      countMap.adjustOrPutValue(activeLabel, 1, 1);
      String line = activeLabel + '\t' + row[1] + '\n';
      labelWriter.write(line);
    }
  } finally {
    Closeables.close(labelWriter, false);
  }
}
From source file:org.apache.mahout.utils.SplitInputTest.java
/**
 * Writes every entry of {@code ClassifierData.DATA} to {@code tempInputFile}
 * as a UTF-8 {@code label<TAB>text} line.
 *
 * @throws IOException if the file cannot be created, written or closed
 */
private void writeSingleInputFile() throws IOException {
  // try-with-resources instead of Closeables.close(writer, true): closing a
  // buffered writer flushes it, so a failure on close means lost data and must
  // not be swallowed.
  try (Writer writer = new BufferedWriter(new OutputStreamWriter(fs.create(tempInputFile), Charsets.UTF_8))) {
    for (String[] entry : ClassifierData.DATA) {
      writer.write(entry[0] + '\t' + entry[1] + '\n');
    }
  }
}
From source file:org.apache.mahout.utils.vectors.arff.ARFFVectorIterable.java
/**
 * Convenience constructor that delegates to the
 * {@code (File, Charset, ARFFModel)} constructor with UTF-8 as the charset.
 *
 * @param file the file handed to the delegate constructor (presumably the ARFF input; confirm there)
 * @param model the model handed to the delegate constructor
 * @throws IOException propagated from the delegate constructor
 */
public ARFFVectorIterable(File file, ARFFModel model) throws IOException { this(file, Charsets.UTF_8, model); }
From source file:org.apache.mahout.utils.vectors.arff.ARFFVectorIterableTest.java
/**
 * Loads the named classpath resource as a UTF-8 string and wraps it in an
 * {@code ARFFVectorIterable} bound to the given model.
 *
 * @param resourceName name of the classpath resource to read
 * @param model the model passed to the iterable
 * @return an iterable built from the resource's text content
 * @throws IOException if the resource cannot be read
 */
static ARFFVectorIterable getVectors(String resourceName, ARFFModel model) throws IOException {
  return new ARFFVectorIterable(
      Resources.toString(Resources.getResource(resourceName), Charsets.UTF_8),
      model);
}
From source file:org.apache.mahout.utils.vectors.arff.DriverTest.java
/**
 * Writes the label bindings of a model fed from {@code sample-dense.arff} and
 * checks that the output equals one of the two accepted expected CSV files.
 */
@Test
public void dictionary() throws IOException {
  ARFFModel model = new MapBackedARFFModel();
  // The returned iterable is not used here; the call is made for its effect on
  // the model (presumably populating it from the ARFF header; confirm in ARFFVectorIterable).
  ARFFVectorIterableTest.getVectors("sample-dense.arff", model);

  StringWriter bindings = new StringWriter();
  Driver.writeLabelBindings(bindings, model, ",");

  String firstAccepted =
      Resources.toString(Resources.getResource("expected-arff-dictionary.csv"), Charsets.UTF_8);
  String secondAccepted =
      Resources.toString(Resources.getResource("expected-arff-dictionary-2.csv"), Charsets.UTF_8);

  String actual = bindings.toString();
  boolean matchesAccepted = firstAccepted.equals(actual) || secondAccepted.equals(actual);
  assertTrue(matchesAccepted);
}