// Java tutorial
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package pln;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Text-mining helpers for a corpus of PDF articles: PDF-to-text conversion, word counting,
 * stop-word removal, and regex-based extraction of authors, addresses, statements of
 * purpose and bibliographic references.
 *
 * @author shaolin
 */
public class Pln {

    /** Running word counts accumulated over every call to {@link #countWord(String)}. */
    public static HashMap<String, Integer> allSW = new HashMap<>();

    /** Headings tried, in this order, when looking for the start of the reference section. */
    private static final String[] REFERENCE_HEADINGS = {"References", "Reference", "R E F E R E N C E S"};

    /** A single author name: capitalised word followed by further names, initials or "de"/"o" particles. */
    private static final String NAME_PATTERN =
            "([A-Z][a-z]+)((\\s*)(([A-Z](\\.|[a-z]*))|(de|o)\\s[A-Z](\\.|[a-z]*)))+";

    /** A whole line consisting only of names separated by "and" or commas. */
    private static final Pattern AUTHOR_LINE_PATTERN = Pattern.compile(
            "\\A" + NAME_PATTERN + "((\\s*)(and|,)(\\s*)" + NAME_PATTERN + ")*(\\s*)\\z");

    // Keyword patterns used with Matcher.find(). The original expressions wrapped every keyword
    // in "(.*)keyword(.*)"; under find() that accepts exactly the same strings as the bare
    // keyword, so the wrappers were dropped.
    private static final Pattern ADDRESS_PATTERN =
            Pattern.compile("[Uu]niversity|[Dd]epartment|[Cc]orporation");
    private static final Pattern OBJECTIVE_PATTERN =
            Pattern.compile("[Pp]urpose|[Oo]bjective");
    private static final Pattern PROBLEM_PATTERN =
            Pattern.compile("[Pp]roblem|[Oo]vercomes|[Ss]olve|[Ss]olution");
    // "analisys"/"analise" are the spellings the original code searched for; they are kept for
    // compatibility, and the correct spellings (analysis, analyse, analyze) were added because
    // correctly spelled text was never matched before.
    private static final Pattern METHOD_PATTERN =
            Pattern.compile("[Mm]ethodology|[Aa]nal(?:ysis|isys|yse|yze|ise)");
    private static final Pattern CONTRIBUTION_PATTERN =
            Pattern.compile("[Cc]ontributes to|[Hh]elps|[Ss]olves");

    /**
     * Alternation of the eight reference-entry shapes recognised by
     * {@link #extractReferences(String)}.
     * NOTE(review): the eighth alternative starts with "(.*)." where the dot is unescaped and
     * therefore matches any character; it may have been meant as a literal "\\." - confirm
     * before changing, since tightening it would alter the extracted references.
     */
    private static final Pattern REFERENCE_PATTERN = Pattern.compile("("
            + "(.*)\\([0-9]*\\),(\\s*)(.*)?,.*"
            + "|" + "(.*)\\([0-9]*\\),(\\s*)(.*)\\n(.*)?,(.*)\\."
            + "|" + "(.*),(\\s*)[0-9]*,(\\s*)(.*),(.*)"
            + "|" + "(.*),(\\s*)[0-9]*,(\\s*)(.*)\\n(.*),"
            + "|" + "(.*)\\[[0-9]*\\](\\s*)([A-Z][a-z]+|[A-Z]\\.)(.*)"
            + "|" + "(.*),(\\s*)[0-9]*(\\s*)\\.(.*)"
            + "|" + "(.*)(\\s*)\\([0-9]{4}\\)(\\s*)(.*)"
            + "|" + "(.*).(\\s*)[0-9]*(\\s*)\\.(.*)\\n(.*),"
            + ")");

    /**
     * Extracts the text of every page of a PDF file.
     *
     * @param fileName path of the PDF file
     * @return the extracted text, or {@code null} when the path is not a regular file, the
     *         parser cannot be created, or parsing/extraction fails (a message is written to
     *         {@code System.err} in each case)
     */
    static String pdftoText(String fileName) {
        File file = new File(fileName);
        if (!file.isFile()) {
            System.err.println("File " + fileName + " does not exist.");
            return null;
        }

        InputStream input = null;
        COSDocument cosDoc = null;
        PDDocument pdDoc = null;
        try {
            PDFParser parser;
            try {
                input = new FileInputStream(file);
                parser = new PDFParser(input);
            } catch (IOException e) {
                System.err.println("Unable to open PDF Parser. " + e.getMessage());
                return null;
            }

            String parsedText = null;
            try {
                parser.parse();
                cosDoc = parser.getDocument();
                PDFTextStripper pdfStripper = new PDFTextStripper();
                pdDoc = new PDDocument(cosDoc);
                pdfStripper.setStartPage(1);
                pdfStripper.setEndPage(pdDoc.getNumberOfPages());
                parsedText = pdfStripper.getText(pdDoc);
            } catch (Exception e) {
                // Deliberately broad: a malformed PDF must not abort the whole corpus run.
                System.err.println("An exception occured in parsing the PDF Document."
                        + e.getMessage());
            }
            return parsedText;
        } finally {
            // Each resource is closed in its own try so that one failing close() cannot
            // prevent the others from being released.
            try {
                if (pdDoc != null) {
                    pdDoc.close();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            try {
                if (cosDoc != null) {
                    cosDoc.close();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            try {
                if (input != null) {
                    input.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Counts the occurrences of each word in {@code text} and adds the same counts to the
     * global {@link #allSW} map.
     *
     * <p>The text is lower-cased (locale-independently) and every run of characters other than
     * {@code a-z} and {@code 0-9} is treated as a single separator. Empty tokens are never
     * counted.
     *
     * @param text the text to analyse; {@code null} is treated as empty
     * @return a new map from word to number of occurrences in {@code text}
     */
    public static HashMap<String, Integer> countWord(String text) {
        HashMap<String, Integer> biblioteca = new HashMap<>();
        if (text == null) {
            return biblioteca;
        }
        // trim() removes the separator left by leading/trailing punctuation, which would
        // otherwise make split() produce an empty first token.
        String normalized = text.toLowerCase(Locale.ROOT).replaceAll("[^a-z0-9]+", " ").trim();
        if (normalized.isEmpty()) {
            return biblioteca;
        }
        for (String palavra : normalized.split(" ")) {
            increment(biblioteca, palavra);
            increment(allSW, palavra);
        }
        return biblioteca;
    }

    /** Adds one to the count stored for {@code word}, starting at 1 when it is absent. */
    private static void increment(Map<String, Integer> counts, String word) {
        Integer current = counts.get(word);
        counts.put(word, current == null ? 1 : current + 1);
    }

    /**
     * Removes, in place, every word shorter than three characters.
     *
     * @param biblioteca word-count map to filter; it is modified
     * @return the same map instance, for chaining
     */
    public static HashMap<String, Integer> removeSmall(HashMap<String, Integer> biblioteca) {
        for (Iterator<String> it = biblioteca.keySet().iterator(); it.hasNext();) {
            if (it.next().length() < 3) {
                it.remove();
            }
        }
        return biblioteca;
    }

    /**
     * Returns the words of {@code biblioteca} ordered by decreasing count. Words with equal
     * counts keep the iteration order of the map.
     *
     * <p>Unlike the previous selection-based implementation, the input map is left untouched.
     *
     * @param biblioteca word-count map
     * @return a new list with one {@code Palavra} per map entry, most frequent first
     */
    public static ArrayList<Palavra> ordWord(HashMap<String, Integer> biblioteca) {
        List<Map.Entry<String, Integer>> entries =
                new ArrayList<Map.Entry<String, Integer>>(biblioteca.entrySet());
        // Collections.sort is stable, so ties stay in map iteration order.
        Collections.sort(entries, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> a, Map.Entry<String, Integer> b) {
                return Integer.compare(b.getValue(), a.getValue());
            }
        });

        ArrayList<Palavra> sorted = new ArrayList<>(entries.size());
        for (Map.Entry<String, Integer> entry : entries) {
            sorted.add(new Palavra(entry.getKey(), entry.getValue()));
        }
        return sorted;
    }

    /**
     * Returns the first ten elements of {@code sorted}, or all of them when there are fewer
     * than ten.
     *
     * @param sorted words, expected to be ordered already (see {@link #ordWord(HashMap)})
     * @return a new list with at most ten elements
     */
    public static ArrayList<Palavra> top10(ArrayList<Palavra> sorted) {
        int limit = Math.min(10, sorted.size());
        return new ArrayList<>(sorted.subList(0, limit));
    }

    /**
     * Deletes every whole-word occurrence of each stop word from {@code text}.
     *
     * @param stopWords words to delete; each is matched literally, so regex metacharacters in
     *                  a stop word have no special meaning
     * @param text      text to clean
     * @return the text with the stop words removed (surrounding whitespace is kept)
     */
    public static String removeSW(ArrayList<String> stopWords, String text) {
        for (String sw : stopWords) {
            text = text.replaceAll("\\b" + Pattern.quote(sw) + "\\b", "");
        }
        return text;
    }

    /**
     * Splits an article into body and reference section at the last occurrence of a reference
     * heading ("References", then "Reference", then "R E F E R E N C E S").
     *
     * @param text full article text
     * @return an {@code Article} built from the body, whose {@code refe} is the reference
     *         section (heading included) or the literal {@code "no refs"} when no heading is
     *         found
     */
    public static Article removeRefs(String text) {
        int indice = -1;
        for (String heading : REFERENCE_HEADINGS) {
            indice = text.lastIndexOf(heading);
            if (indice != -1) {
                break;
            }
        }

        String refs;
        if (indice == -1) {
            refs = "no refs";
        } else {
            refs = text.substring(indice);
            text = text.substring(0, indice);
        }
        Article art = new Article(text);
        art.setRefe(refs);
        return art;
    }

    /**
     * Splits text into statements at every {@code '.'}; the periods themselves are dropped.
     *
     * <p>Text after the last period is returned as a final statement when it is not empty.
     * (The previous implementation cut off the last character and threw on empty input or on
     * input ending with a period.)
     *
     * @param pdf text to split; {@code null} is treated as empty
     * @return the statements in order of appearance
     */
    public static ArrayList<String> getStatements(String pdf) {
        ArrayList<String> st = new ArrayList<>();
        if (pdf == null) {
            return st;
        }
        int start = 0;
        for (int i = 0; i < pdf.length(); i++) {
            if (pdf.charAt(i) == '.') {
                st.add(pdf.substring(start, i));
                start = i + 1;
            }
        }
        if (start < pdf.length()) {
            st.add(pdf.substring(start));
        }
        return st;
    }

    /**
     * Finds lines that consist solely of author names.
     *
     * <p>Each line has every character other than letters, digits, {@code '.'} and
     * {@code ','} replaced by a space and whitespace runs collapsed; the cleaned line is
     * returned when it matches the author-list pattern and the match is longer than eight
     * characters.
     *
     * @param txt raw text, lines separated by {@code '\n'}
     * @return the cleaned lines that look like author lists
     */
    public static ArrayList<String> findAuthors(String txt) {
        ArrayList<String> r = new ArrayList<>();
        for (String line : txt.split("\n")) {
            String cleaned = line.replaceAll("[^A-Za-z0-9\\.,]", " ").replaceAll("\\s+", " ");
            Matcher match = AUTHOR_LINE_PATTERN.matcher(cleaned);
            // The pattern is anchored at both ends, so a line can match at most once.
            if (match.find() && match.end() - match.start() > 8) {
                r.add(cleaned);
            }
        }
        return r;
    }

    /** Returns the statements mentioning a university, department or corporation. */
    public static ArrayList<String> findAddress(ArrayList<String> pdf) {
        return selectMatching(ADDRESS_PATTERN, pdf);
    }

    /** Returns the statements containing "purpose" or "objective". */
    public static ArrayList<String> findObjective(ArrayList<String> pdf) {
        return selectMatching(OBJECTIVE_PATTERN, pdf);
    }

    /** Returns the statements containing "problem", "overcomes", "solve" or "solution". */
    public static ArrayList<String> findProblem(ArrayList<String> pdf) {
        return selectMatching(PROBLEM_PATTERN, pdf);
    }

    /**
     * Returns the statements containing "methodology" or a form of "analysis"/"analyse"
     * (including the legacy misspellings "analisys" and "analise").
     */
    public static ArrayList<String> findMethod(ArrayList<String> pdf) {
        return selectMatching(METHOD_PATTERN, pdf);
    }

    /** Returns the statements containing "contributes to", "helps" or "solves". */
    public static ArrayList<String> findContribution(ArrayList<String> pdf) {
        return selectMatching(CONTRIBUTION_PATTERN, pdf);
    }

    /** Returns, in input order, the elements of {@code statements} in which {@code pattern} occurs. */
    private static ArrayList<String> selectMatching(Pattern pattern, ArrayList<String> statements) {
        ArrayList<String> hits = new ArrayList<>();
        for (String statement : statements) {
            if (pattern.matcher(statement).find()) {
                hits.add(statement);
            }
        }
        return hits;
    }

    /**
     * Extracts the substrings of a reference section that look like bibliography entries.
     *
     * @param pdf text of the reference section
     * @return every successive match of the reference pattern, in order
     */
    public static ArrayList<String> extractReferences(String pdf) {
        ArrayList<String> r = new ArrayList<>();
        Matcher match = REFERENCE_PATTERN.matcher(pdf);
        while (match.find()) {
            r.add(match.group());
        }
        return r;
    }

    /**
     * Reads every file in the {@code Corpus} directory, extracts its references and writes
     * them, one per line followed by their count, to {@code output/<name>.txt}.
     *
     * <p>Only reference extraction is active. The stop-word / word-count / top-10 pipeline
     * ({@link #countWord}, {@link #removeSW}, {@link #removeSmall}, {@link #ordWord},
     * {@link #top10}) and the author, address, objective, problem, method and contribution
     * extractors are not invoked here.
     *
     * @param args the command line arguments (unused)
     */
    public static void main(String args[]) {
        File dir = new File("Corpus");
        String[] children = dir.list();
        if (children == null) {
            // File.list() returns null when the path is not a readable directory.
            System.err.println("Corpus directory " + dir.getAbsolutePath()
                    + " does not exist or cannot be read.");
            return;
        }

        HashMap<String, Article> artigos = new HashMap<>();
        for (String child : children) {
            System.out.println("Reading file " + child);
            String text = pdftoText("Corpus/" + child);
            if (text == null) {
                // pdftoText has already reported why this entry could not be read.
                continue;
            }
            Article art = removeRefs(text);
            art.setReferences(extractReferences(art.getRefe()));
            artigos.put(child, art);
        }

        File outDir = new File("output");
        if (!outDir.isDirectory() && !outDir.mkdirs()) {
            System.err.println("Unable to create output directory " + outDir.getAbsolutePath());
            return;
        }

        for (Map.Entry<String, Article> entry : artigos.entrySet()) {
            String key = entry.getKey();
            Article article = entry.getValue();
            System.out.println(key);

            // Output name is the part of the file name before its first dot.
            int firstDot = key.indexOf('.');
            String baseName = firstDot < 0 ? key : key.substring(0, firstDot);

            try (PrintWriter writer = new PrintWriter("output/" + baseName + ".txt", "UTF-8")) {
                for (String ref : article.getReferences()) {
                    writer.println(ref);
                }
                writer.println(article.getReferences().size());
                // PrintWriter swallows I/O errors; surface them explicitly.
                if (writer.checkError()) {
                    System.err.println("Error while writing output/" + baseName + ".txt");
                }
            } catch (FileNotFoundException | UnsupportedEncodingException ex) {
                ex.printStackTrace();
            }
        }
    }
}