extractor.Extractor.java Source code

Java tutorial

Introduction

Here is the source code for extractor.Extractor.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package extractor;

import classifierdoc.Classifierdoc;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

/**
 * Extracts the text content of PDF files and wraps each result in a
 * {@link Document}.
 *
 * @author Lucas
 */
public class Extractor {

    /**
     * Loads every listed PDF file and builds one {@link Document} from the
     * text extracted from it.
     *
     * <p>The path of each file is formed by plain string concatenation of
     * {@code pathBase} and the file name, so {@code pathBase} must already end
     * with whatever separator is needed. A file that cannot be found is
     * reported on standard output and skipped; any other I/O failure while
     * loading, reading or closing a file is logged at {@code SEVERE} level and
     * that file is skipped as well. Neither failure aborts the remaining
     * files.
     *
     * @param pathBase prefix prepended to every entry of {@code files}
     * @param files    names of the PDF files to process
     * @return the documents that were extracted successfully, in the order of
     *         {@code files}; never {@code null}
     */
    public static ArrayList<Document> returnDocuments(String pathBase, String[] files) {

        ArrayList<Document> documents = new ArrayList<>();

        for (String file : files) {
            try {
                String paperString = extractText(new File(pathBase + file));
                documents.add(new Document(paperString));
            } catch (FileNotFoundException ex) {
                System.out.println("Arquivo no encontrado! Detalhes: " + ex.getLocalizedMessage());
            } catch (IOException ex) {
                Logger.getLogger(Classifierdoc.class.getName()).log(Level.SEVERE, null, ex);
            }
        }

        return documents;
    }

    /**
     * Reads the whole text of one PDF file.
     *
     * <p>The PDF is opened in a try-with-resources statement so that it is
     * closed even when text extraction throws; previously the document was
     * only closed on the success path and leaked on failure.
     *
     * @param pdfFile the PDF file to read
     * @return the text returned by {@link PDFTextStripper#getText(PDDocument)}
     * @throws IOException if the file cannot be loaded, read or closed
     */
    private static String extractText(File pdfFile) throws IOException {
        try (PDDocument pdDocument = PDDocument.load(pdfFile)) {
            return new PDFTextStripper().getText(pdDocument);
        }
    }

}