opennlp.tools.formats.pan.pan16.PAN16Reader.java Source code

Java tutorial

Introduction

Here is the source code for opennlp.tools.formats.pan.pan16.PAN16Reader.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats.pan.pan16;

import org.apache.commons.io.FilenameUtils;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.Unmarshaller;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;
import java.security.InvalidParameterException;
import java.util.ArrayList;
import java.util.HashMap;

/**
 * Loads authors of the English PAN16 author-profiling training dataset from a folder that is
 * looked up on the classpath.
 *
 * <p>The folder is expected to hold one XML file per author plus a {@code truth.txt} file whose
 * lines have the form {@code <file base name>:::<gender>:::<age group>}.
 *
 * <p>Originally created by anthony on 4/20/16.
 */
public class PAN16Reader {

    /** Classpath location of the dataset folder (author XML files and {@code truth.txt}). */
    private static final String DATA_FOLDER = "/datasets/pan16-author-profiling-training-dataset-english-2016-02-29/";

    /** Name of the label file inside {@link #DATA_FOLDER}. */
    private static final String TRUTH_FILE_NAME = "truth.txt";

    /** Separator between the fields of a {@code truth.txt} line. */
    private static final String TRUTH_SEPARATOR = ":::";

    /**
     * Unmarshals the author XML files found at positions {@code startAuthorIndex} to
     * {@code endAuthorIndex} (both inclusive) of the data folder listing and labels each author
     * with the gender and age group recorded in {@code truth.txt}.
     *
     * <p>The indices address the raw directory listing, so entries that are not {@code .xml}
     * files (for instance {@code truth.txt} itself) occupy an index too and are skipped. The
     * order of that listing is whatever {@link File#listFiles()} returns, which the JDK does not
     * guarantee to be sorted.
     *
     * <p>Loading is best effort: if the data folder or the labels cannot be read an empty list is
     * returned, and if unmarshalling a file fails the authors loaded so far are returned. In both
     * cases the problem is reported on standard error. Authors without a {@code truth.txt} entry
     * are skipped.
     *
     * @param startAuthorIndex index of the first directory entry to read, inclusive
     * @param endAuthorIndex   index of the last directory entry to read, inclusive
     * @return the labelled authors that could be loaded; never {@code null}
     * @throws IndexOutOfBoundsException if {@code startAuthorIndex} is negative or
     *                                   {@code endAuthorIndex} is past the last directory entry
     * @throws InvalidParameterException if {@code startAuthorIndex} is greater than
     *                                   {@code endAuthorIndex}
     */
    public static ArrayList<PAN16Author> getPAN16Authors(int startAuthorIndex, int endAuthorIndex) {
        ArrayList<PAN16Author> authors = new ArrayList<PAN16Author>();

        File[] listOfFiles;
        try {
            listOfFiles = resolveDataFolder().listFiles();
        } catch (IOException e) {
            e.printStackTrace();
            return authors;
        }
        if (listOfFiles == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            System.err.println("PAN16 data folder cannot be listed: " + DATA_FOLDER);
            return authors;
        }

        // Argument validation happens outside the catch-all below so that the exceptions
        // actually reach the caller instead of being printed and swallowed.
        if (startAuthorIndex < 0 || endAuthorIndex >= listOfFiles.length) {
            throw new IndexOutOfBoundsException("an author index is out of bounds, should be in : 0-"
                    + (listOfFiles.length - 1) + " range");
        } else if (startAuthorIndex > endAuthorIndex) {
            throw new InvalidParameterException("the start index is larger than the end index");
        }

        HashMap<String, String[]> truthMap = getTruthMap();
        if (truthMap == null) {
            // getTruthMap() has already reported why the labels could not be read.
            return authors;
        }

        try {
            JAXBContext jaxbContext = JAXBContext.newInstance(PAN16Author.class);
            Unmarshaller jaxbUnmarshaller = jaxbContext.createUnmarshaller();

            for (int i = startAuthorIndex; i <= endAuthorIndex; i++) {
                File file = listOfFiles[i];
                if (!file.isFile() || !"xml".equals(FilenameUtils.getExtension(file.getPath()))) {
                    continue;
                }

                // truth.txt is keyed by the XML file name without its extension.
                String authorId = FilenameUtils.getBaseName(file.getPath());
                String[] truths = truthMap.get(authorId);
                if (truths == null) {
                    System.err.println("no " + TRUTH_FILE_NAME + " entry for " + file.getName() + ", skipping it");
                    continue;
                }

                PAN16Author author = (PAN16Author) jaxbUnmarshaller.unmarshal(file);
                author.setGender(truths[0]);
                author.setAge_group(truths[1]);
                authors.add(author);
            }
        } catch (Exception e) {
            // Best-effort boundary: a JAXB failure stops the loading but keeps what was read.
            e.printStackTrace();
        }
        return authors;
    }

    /**
     * Reads {@code truth.txt} from the data folder into a map from the first field of each line
     * to a two-element array holding the second field (used as gender) and the third field (used
     * as age group).
     *
     * <p>Lines with fewer than three {@code :::}-separated fields, such as blank lines, are
     * ignored.
     *
     * @return the parsed labels, or {@code null} if the file could not be located or read (the
     *         cause is printed to standard error)
     */
    public static HashMap<String, String[]> getTruthMap() {
        HashMap<String, String[]> truthMap = new HashMap<String, String[]>();

        // try-with-resources guarantees the reader is closed on every path.
        try (BufferedReader in = new BufferedReader(
                new FileReader(new File(resolveDataFolder(), TRUTH_FILE_NAME)))) {
            String line;
            while ((line = in.readLine()) != null) {
                String[] tokens = line.split(TRUTH_SEPARATOR);
                if (tokens.length < 3) {
                    continue;
                }
                truthMap.put(tokens[0], new String[] {tokens[1], tokens[2]});
            }
            return truthMap;
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Resolves {@link #DATA_FOLDER} on the classpath to a directory on disk.
     *
     * <p>The resource URL is converted through {@link URL#toURI()} rather than
     * {@link URL#getPath()} so that percent-encoded characters (for example spaces in the
     * installation path) are decoded correctly.
     *
     * @return the data folder as a {@link File}
     * @throws IOException if the folder is not on the classpath or is not a plain file-system
     *                     location
     */
    private static File resolveDataFolder() throws IOException {
        URL location = PAN16Reader.class.getResource(DATA_FOLDER);
        if (location == null) {
            throw new FileNotFoundException("PAN16 data folder not found on the classpath: " + DATA_FOLDER);
        }
        try {
            return new File(location.toURI());
        } catch (URISyntaxException e) {
            throw new IOException("PAN16 data folder has an invalid location: " + location, e);
        } catch (IllegalArgumentException e) {
            // File(URI) rejects non-"file:" URIs, e.g. when the resource is packed inside a jar.
            throw new IOException("PAN16 data folder is not a directory on disk: " + location, e);
        }
    }
}