Example usage for org.apache.commons.io Charsets UTF_8

List of usage examples for org.apache.commons.io Charsets UTF_8

Introduction

On this page you can find example usages of org.apache.commons.io Charsets UTF_8.

Prototype

Charset UTF_8

To view the source code for org.apache.commons.io Charsets UTF_8, click the Source Link.

Document

Eight-bit Unicode Transformation Format.

Usage

From source file:org.apache.mahout.utils.email.MailProcessorTest.java

/**
 * Parses {@code test.mbox} twice using the same options object and checks the effect of
 * {@code setStripQuotedText(true)}: before it is set the quoted line must appear in the
 * output, afterwards it must not. Both passes must return a count of 7.
 */
@Test
public void testStripQuoted() throws Exception {
    final String quotedLine = "> Cocoon Cron Block Configurable Clustering";

    StringWriter output = new StringWriter();
    MailOptions options = new MailOptions();
    options.setSeparator(":::");
    options.setCharset(Charsets.UTF_8);
    options.setPatternsToMatch(new Pattern[] { MailProcessor.SUBJECT_PREFIX });
    options.setInput(new File(System.getProperty("user.dir")));
    options.setIncludeBody(true);

    // First pass: strip-quoted-text has not been enabled, so the quoted line is expected.
    MailProcessor processor = new MailProcessor(options, "", output);
    File mbox = new File(MailProcessorTest.class.getClassLoader().getResource("test.mbox").toURI());
    long parsed = processor.parseMboxLineByLine(mbox);
    assertEquals(7, parsed);
    assertTrue(output.getBuffer().toString().contains(quotedLine));

    // Second pass: fresh writer and processor; the option is switched on after the
    // processor is constructed, exactly as in the first-pass setup order.
    output = new StringWriter();
    processor = new MailProcessor(options, "", output);
    options.setStripQuotedText(true);
    parsed = processor.parseMboxLineByLine(mbox);
    assertEquals(7, parsed);
    assertFalse(output.getBuffer().toString().contains(quotedLine));
}

From source file:org.apache.mahout.utils.MatrixDumper.java

/**
 * Opens the stream that output should be written to.
 *
 * @param outputPath path of the file to write, or {@code null} to use standard output
 * @return {@link System#out} when {@code outputPath} is {@code null}; otherwise a
 *         non-auto-flushing UTF-8 {@link PrintStream} over the file at {@code outputPath}
 * @throws IOException if the file cannot be created or opened
 */
private static PrintStream getPrintStream(String outputPath) throws IOException {
    if (outputPath == null) {
        return System.out;
    }
    File outputFile = new File(outputPath);
    // Best effort: remove any previous file and create an empty one. The results are
    // deliberately not checked; FileOutputStream below reports an unusable path.
    if (outputFile.exists()) {
        outputFile.delete();
    }
    outputFile.createNewFile();
    OutputStream os = new FileOutputStream(outputFile);
    boolean wrapped = false;
    try {
        // name() is the canonical charset name expected by PrintStream; displayName() is
        // the human-readable, potentially locale-dependent form.
        PrintStream stream = new PrintStream(os, false, Charsets.UTF_8.name());
        wrapped = true;
        return stream;
    } finally {
        // Do not leak the file handle if the PrintStream could not be constructed.
        if (!wrapped) {
            os.close();
        }
    }
}

From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilter.java

/**
 * Creates a token filter that keeps or drops tokens based on bloom filter membership.
 *
 * @param filter tokens will be checked for membership in this bloom filter
 * @param keepMembers keep members of the bloom filter? If true works like
 *   a whitelist and members found in the list are kept and all others are
 *   dropped. If false works like a stoplist and members found in the
 *   filter are dropped all others are kept.
 * @param in the tokenstream to read.
 */
public BloomTokenFilter(Filter filter, boolean keepMembers, TokenStream in) {
    super(in);
    this.filter = filter;
    this.keepMembers = keepMembers;
    key = new Key();
    termAtt = addAttribute(CharTermAttribute.class);
    // The UTF-8 encoder is configured to report malformed input and unmappable
    // characters as errors.
    encoder = Charsets.UTF_8.newEncoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);
}

From source file:org.apache.mahout.utils.SequenceFileDumper.java

/**
 * Command-line entry point: writes the records of each input path as text.
 *
 * <p>Registers the {@code substring}, {@code count}, {@code numItems}, {@code facets} and
 * {@code quiet} options (plus the input and output options), parses {@code args}, and then
 * iterates every input path with a {@code SequenceFileIterator}. When the input is a
 * directory, the paths come from listing it with {@code PathFilters.logsCRCFilter()};
 * otherwise the input itself is the only path. Output goes to the file named by the
 * {@code output} option, or to {@code System.out} when that option is absent, encoded as
 * UTF-8 in both cases.
 *
 * <p>Option values are read once, before the output is opened, so a malformed
 * {@code substring} or {@code numItems} value fails before any output file is created.
 * As before, {@code numItems} is only consulted when {@code count} is not set.
 *
 * @param args command-line arguments
 * @return {@code -1} if {@code parseArguments} returns {@code null}, otherwise {@code 0}
 * @throws Exception if argument parsing, reading the input or writing the output fails
 */
@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("substring", "b", "The number of chars to print out per value", false);
    addOption(buildOption("count", "c", "Report the count only", false, false, null));
    addOption("numItems", "n", "Output at most <n> key value pairs", false);
    addOption(
            buildOption("facets", "fa", "Output the counts per key.  Note, if there are a lot of unique keys, "
                    + "this can take up a fair amount of memory", false, false, null));
    addOption(buildOption("quiet", "q", "Print only file contents.", false, false, null));

    if (parseArguments(args, false, true) == null) {
        return -1;
    }

    // Options do not change while dumping, so resolve them once instead of per path/record.
    boolean quiet = hasOption("quiet");
    boolean countOnly = hasOption("count");
    boolean withFacets = hasOption("facets");
    int sub = Integer.MAX_VALUE;
    if (hasOption("substring")) {
        sub = Integer.parseInt(getOption("substring"));
    }
    // numItems is ignored in count-only mode, so it is not parsed there either.
    boolean limited = !countOnly && hasOption("numItems");
    long numItems = Long.MAX_VALUE;
    if (limited) {
        numItems = Long.parseLong(getOption("numItems"));
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    Path input = getInputPath();
    FileSystem fs = input.getFileSystem(conf);
    if (fs.getFileStatus(input).isDir()) {
        pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter()));
    } else {
        pathArr = new Path[1];
        pathArr[0] = input;
    }

    Writer writer;
    boolean shouldClose;
    if (hasOption("output")) {
        shouldClose = true;
        writer = Files.newWriter(new File(getOption("output")), Charsets.UTF_8);
    } else {
        // System.out is only flushed, never closed.
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }
    try {
        for (Path path : pathArr) {
            if (!quiet) {
                writer.append("Input Path: ").append(String.valueOf(path)).append('\n');
            }

            // NOTE(review): the iterator is never closed explicitly here; when numItems stops
            // iteration early it may still hold its reader - confirm against SequenceFileIterator.
            SequenceFileIterator<?, ?> iterator = new SequenceFileIterator<>(path, true, conf);
            if (!quiet) {
                writer.append("Key class: ").append(iterator.getKeyClass().toString());
                writer.append(" Value Class: ").append(iterator.getValueClass().toString()).append('\n');
            }
            OpenObjectIntHashMap<String> facets = null;
            if (withFacets) {
                facets = new OpenObjectIntHashMap<>();
            }
            long count = 0;
            if (countOnly) {
                while (iterator.hasNext()) {
                    Pair<?, ?> record = iterator.next();
                    String key = record.getFirst().toString();
                    if (facets != null) {
                        facets.adjustOrPutValue(key, 1, 1); //either insert or add 1
                    }
                    count++;
                }
                writer.append("Count: ").append(String.valueOf(count)).append('\n');
            } else {
                if (limited && !quiet) {
                    writer.append("Max Items to dump: ").append(String.valueOf(numItems)).append("\n");
                }
                while (iterator.hasNext() && count < numItems) {
                    Pair<?, ?> record = iterator.next();
                    String key = record.getFirst().toString();
                    writer.append("Key: ").append(key);
                    String str = record.getSecond().toString();
                    // Truncate the value to at most `sub` characters.
                    writer.append(": Value: ").append(str.length() > sub ? str.substring(0, sub) : str);
                    writer.write('\n');
                    if (facets != null) {
                        facets.adjustOrPutValue(key, 1, 1); //either insert or add 1
                    }
                    count++;
                }
                if (!quiet) {
                    writer.append("Count: ").append(String.valueOf(count)).append('\n');
                }
            }
            if (facets != null) {
                // Emit the per-key counts, ordered by key.
                List<String> keyList = new ArrayList<>(facets.size());

                IntArrayList valueList = new IntArrayList(facets.size());
                facets.pairsSortedByKey(keyList, valueList);
                writer.append("-----Facets---\n");
                writer.append("Key\t\tCount\n");
                int i = 0;
                for (String key : keyList) {
                    writer.append(key).append("\t\t").append(String.valueOf(valueList.get(i++))).append('\n');
                }
            }
        }
        writer.flush();

    } finally {
        if (shouldClose) {
            Closeables.close(writer, false);
        }
    }

    return 0;
}

From source file:org.apache.mahout.utils.SplitInputTest.java

/**
 * Prepares the fixture: obtains the file system, runs the parent set-up, resets the
 * label count map and charset, resolves the temporary paths, and configures a fresh
 * {@code SplitInput} with the training, test and input directories.
 */
@Override
@Before
public void setUp() throws Exception {
    // The file system is obtained from the test configuration before the parent set-up runs.
    fs = FileSystem.get(getConfiguration());

    super.setUp();

    charset = Charsets.UTF_8;
    countMap = new OpenObjectIntHashMap<>();

    tempSequenceDirectory = getTestTempFilePath("tmpsequence");
    tempInputFile = getTestTempFilePath("bayesinputfile");
    tempTrainingDirectory = getTestTempDirPath("bayestrain");
    tempTestDirectory = getTestTempDirPath("bayestest");
    tempMapRedOutputDirectory = new Path(getTestTempDirPath(), "mapRedOutput");
    tempInputDirectory = getTestTempDirPath("bayesinputdir");

    SplitInput splitter = new SplitInput();
    splitter.setTrainingOutputDirectory(tempTrainingDirectory);
    splitter.setTestOutputDirectory(tempTestDirectory);
    splitter.setInputDirectory(tempInputDirectory);
    si = splitter;
}

From source file:org.apache.mahout.utils.SplitInputTest.java

/**
 * Writes {@code ClassifierData.DATA} into {@code tempInputDirectory}, one file per label.
 *
 * <p>A new file (named after the label) is started whenever the label of the current
 * entry differs from the previous one; each entry is written as {@code label<TAB>text}
 * followed by a newline, and {@code countMap} is incremented for the entry's label.
 *
 * @throws IOException if a file cannot be created, written or closed
 */
private void writeMultipleInputFiles() throws IOException {
    String currentLabel = null;
    Writer writer = null;
    try {
        for (String[] entry : ClassifierData.DATA) {
            String label = entry[0];
            boolean labelChanged = !label.equals(currentLabel);
            if (labelChanged) {
                // Finish the previous label's file (if any) before opening the next one.
                Closeables.close(writer, false);
                currentLabel = label;

                Path labelFile = new Path(tempInputDirectory, currentLabel);
                writer = new BufferedWriter(new OutputStreamWriter(fs.create(labelFile), Charsets.UTF_8));
            }
            countMap.adjustOrPutValue(currentLabel, 1, 1);
            writer.write(currentLabel + '\t' + entry[1] + '\n');
        }
    } finally {
        Closeables.close(writer, false);
    }
}

From source file:org.apache.mahout.utils.SplitInputTest.java

/**
 * Writes every entry of {@code ClassifierData.DATA} to {@code tempInputFile} as
 * {@code label<TAB>text} lines, encoded as UTF-8.
 *
 * <p>The writer is managed by try-with-resources, so a failure while closing (which for
 * a buffered writer includes the final flush) is reported to the caller instead of being
 * swallowed.
 *
 * @throws IOException if the file cannot be created, written or closed
 */
private void writeSingleInputFile() throws IOException {
    try (Writer writer = new BufferedWriter(
            new OutputStreamWriter(fs.create(tempInputFile), Charsets.UTF_8))) {
        for (String[] entry : ClassifierData.DATA) {
            writer.write(entry[0] + '\t' + entry[1] + '\n');
        }
    }
}

From source file:org.apache.mahout.utils.vectors.arff.ARFFVectorIterable.java

/**
 * Convenience constructor that delegates to the three-argument constructor, passing
 * {@code Charsets.UTF_8} as the charset.
 *
 * @param file the file handed to the delegated constructor
 * @param model the model handed to the delegated constructor
 * @throws IOException propagated from the delegated constructor
 */
public ARFFVectorIterable(File file, ARFFModel model) throws IOException {
    this(file, Charsets.UTF_8, model);
}

From source file:org.apache.mahout.utils.vectors.arff.ARFFVectorIterableTest.java

/**
 * Loads the named classpath resource as UTF-8 text and wraps it in an
 * {@code ARFFVectorIterable} bound to the given model.
 *
 * @param resourceName name of the resource to load
 * @param model the model passed to the iterable
 * @return an iterable created from the resource text and {@code model}
 * @throws IOException if the resource cannot be read
 */
static ARFFVectorIterable getVectors(String resourceName, ARFFModel model) throws IOException {
    return new ARFFVectorIterable(
            Resources.toString(Resources.getResource(resourceName), Charsets.UTF_8),
            model);
}

From source file:org.apache.mahout.utils.vectors.arff.DriverTest.java

/**
 * Loads {@code sample-dense.arff} into a fresh model, writes the model's label bindings
 * with a comma delimiter, and checks the output equals one of the two expected
 * dictionary resources.
 */
@Test
public void dictionary() throws IOException {
    final String firstExpectedResource = "expected-arff-dictionary.csv";
    final String secondExpectedResource = "expected-arff-dictionary-2.csv";

    ARFFModel model = new MapBackedARFFModel();
    ARFFVectorIterableTest.getVectors("sample-dense.arff", model);

    StringWriter writer = new StringWriter();
    Driver.writeLabelBindings(writer, model, ",");

    String expected1 = Resources.toString(Resources.getResource(firstExpectedResource), Charsets.UTF_8);
    String expected2 = Resources.toString(Resources.getResource(secondExpectedResource), Charsets.UTF_8);

    // Either of the two expected dictionaries is accepted.
    String actual = writer.toString();
    assertTrue(expected1.equals(actual) || expected2.equals(actual));
}