List of usage examples for com.itextpdf.text.pdf.parser.PdfTextExtractor.getTextFromPage
public static String getTextFromPage(PdfReader reader, int pageNumber) throws IOException
From source file:englishrusbook.Reader.java
public HashMap<Integer, String> getPageMap() { debugOut("function:getPDFPages"); HashMap<Integer, String> map = new HashMap<>(); for (int i = 1, pageCount = reader.getNumberOfPages(); i <= pageCount; i++) { try {//from w w w. j a v a 2 s.c om map.put(i, PdfTextExtractor.getTextFromPage(reader, i)); debugOut("ShowPageOfNumber: " + String.valueOf(i)); } catch (IOException e) { System.out.println("ERROR in reading: " + e); } } return map; }
From source file:example.Cap1BackupCode.java
/**
 * Parses a cutoff-list PDF page by page and stores the result in MongoDB.
 *
 * <p>The method opens the hard-coded file
 * {@code /home/sachin/Downloads/2014ENGG_Cutoff_CAP1.pdf}, extracts the text of
 * each page with {@code PdfTextExtractor.getTextFromPage} and walks that text
 * line by line. A 1-based line counter ({@code count}) restarts on every page
 * and selects what a line means:
 * <ul>
 *   <li>line 7: college name, taken from the text after the first 6 characters.
 *       When it differs from the previous college, the previous college object
 *       is added to {@code example} and a new one is started; if the name
 *       contains commas, the part after the last comma is stored as the city.</li>
 *   <li>line 8: branch name, taken from the text after the first 11 characters.
 *       If the line contains "NT1", "NT2", "NT3" or "OBC" the rest of the page
 *       is abandoned ({@code break}).</li>
 *   <li>lines 11, 14, 17, 20, 26 and 29: the open, SC, ST, VJ/NT1, NT2 and
 *       NT3/OBC values, stored only when {@code isNumeric} accepts them. At
 *       lines 20 and 29 the previous line ({@code prev}) decides which field
 *       the value belongs to, and extra lines may be consumed from the page.</li>
 * </ul>
 * After the page loop the last branch and college are flushed, and every
 * college is inserted as one document (fields {@code college_name},
 * {@code city} and a {@code Branch} list) into collection {@code project} of
 * database {@code CollegeFinder} on {@code localhost}.
 *
 * <p>NOTE(review): the page loop condition is {@code i < reader.getNumberOfPages()},
 * so the final page is never read; iText page numbers run from 1 to
 * getNumberOfPages() inclusive. Confirm whether skipping it is intended.
 * <br>NOTE(review): {@code getPrevCollege() != ""} and {@code getPrevBranch() != ""}
 * compare references, not contents.
 * <br>NOTE(review): the {@code x.equals(null)} checks can never be true, so the
 * {@code line = "0"} defaults are dead code.
 * <br>NOTE(review): when the college changes on a page, line 8 appears to append
 * the previous branch to the newly created college object - verify.
 * <br>NOTE(review): {@code page.getBytes()} and {@code InputStreamReader} use the
 * platform default charset; {@code exclusions} is never read; {@code reader}
 * and {@code mongo1} are never closed.
 * <br>NOTE(review): the {@code // ww w. ja v a2s . c o m} fragment below is a
 * scraper watermark that was split across a line break and should be removed.
 *
 * @param args command-line arguments; not used
 * @throws IOException if the PDF cannot be opened or a page cannot be read
 */
public static void main(String[] args) throws IOException { String line, prev = ""; Example example = new Example(); String[] exclusions = { "100550310 - Food Technology", "100551710 - Oil and Paints Technology", "100552410 - Paper and Pulp Technology", "100552710 - Petro Chemical Engineering" }; // File file=new File("/home/sachin/Downloads/2014ENGG_Cutoff_CAP1.txt"); PdfReader reader = new PdfReader("/home/sachin/Downloads/2014ENGG_Cutoff_CAP1.pdf"); // System.out.println("This PDF has " + reader.getNumberOfPages() + " pages."); for (int i = 1; i < reader.getNumberOfPages(); i += 1) { String page = PdfTextExtractor.getTextFromPage(reader, i); InputStream is = new ByteArrayInputStream(page.getBytes()); // read it with BufferedReader BufferedReader br = new BufferedReader(new InputStreamReader(is)); int count = 1; while ((line = br.readLine()) != null) { if (count == 7) { // System.out.println("substring is"+line.substring(6)); // System.out.println("Prev college is"+example.getPrevCollege()); if (!line.substring(6).trim().equals(example.getPrevCollege())) { // System.out.println("College Name" + line); if (example.getPrevCollege() != "") { example.addObject(example.getSinglecollegedata()); }// ww w. ja v a2s . 
c o m example.setPrevCollege(line.substring(6).trim()); CollegeDataObject object = new CollegeDataObject(); object.setCollegeName(line.substring(6).trim()); int temp; if ((temp = line.substring(6).trim().split(",").length) > 1) { object.setCity(line.substring(6).trim().split(",")[temp - 1].trim()); object.setCollegeName(line.substring(6).trim() .replace(line.substring(6).trim().split(",")[temp - 1].trim(), "").trim() .replace(",", "").trim()); } example.setSinglecollegedata(object); } else { } } if (count == 8) { // System.out.print("Branch name" + line); if (!line.contains("NT1") && !line.contains("NT2") && !line.contains("NT3") && !line.contains("OBC")) { if (example.getPrevBranch() != "") { example.getSinglecollegedata().AddObject(example.getCastDataObject()); CastDataObject object = new CastDataObject(); object.setBranchName(line.substring(11).trim()); example.setPrevBranch(line.substring(11).trim()); example.setCastDataObject(object); } else { CastDataObject object = new CastDataObject(); object.setBranchName(line.substring(11).trim()); example.setCastDataObject(object); example.setPrevBranch(line.substring(11).trim()); } } else { break; } } if (count == 11) { // System.out.println("open rank"+line); if (isNumeric(line.split(" ")[0])) { if (line.split(" ")[0].trim().equals(null)) line = "0"; example.getCastDataObject().setOpen(line.split(" ")[0].trim()); } } if (count == 14) { // System.out.println("sc rank is"+line); if (isNumeric(line)) { if (line.trim().equals(null)) line = "0"; example.getCastDataObject().setSc(line.trim()); } } if (count == 17) { // System.out.println("st rank is"+line); if (isNumeric(line)) { if (line.trim().equals(null)) line = "0"; example.getCastDataObject().setSt(line.trim()); } } if (count == 20) { if (isNumeric(line)) { if (line.trim().equals(null)) line = "0"; if (prev.equals("GVJO")) { // System.out.println("VJ rank is"+line); example.getCastDataObject().setVj(line.trim()); line = br.readLine(); line = br.readLine(); line = 
br.readLine(); count += 3; // System.out.println("NT1 rank is"+line); example.getCastDataObject().setNt1(line.trim()); } else { // System.out.println("NT1 rank is" + line); example.getCastDataObject().setNt1(line.trim()); count += 3; } } } // if (count==23) // { // System.out.println("NT1 rank is"+line); // } if (count == 26) { if (isNumeric(line)) { // System.out.println("NT2 rank is"+line); if (line.trim().equals(null)) line = "0"; example.getCastDataObject().setNt2(line.trim()); } } if (count == 29) { if (prev.equals("GNT3O")) { // System.out.println("NT3 rank is" + line); if (isNumeric(line)) { if (line.trim().equals(null)) line = "0"; example.getCastDataObject().setNt3(line.trim()); line = br.readLine(); line = br.readLine(); line = br.readLine(); // System.out.println("OBC rank is" + line); } if (isNumeric(line.split(" ")[0])) { if (line.split(" ")[0].trim().equals(null)) line = "0"; example.getCastDataObject().setOBC(line.split(" ")[0].trim()); } } else { // System.out.println("OBC rank is" + line); if (isNumeric(line.split(" ")[0])) { if (line.split(" ")[0].trim().equals(null)) line = "0"; example.getCastDataObject().setOBC(line.split(" ")[0].trim()); } } } count++; prev = line; } // if (count==7) // { // if (!line.substring(6).equals(example.getPrevCollege())) // System.out.println("College name is"+line.substring(6)); // example.setPrevCollege(line.substring(6)); // // } // if (count==8) // { // if (!line.contains("NT1") && !line.contains("NT2") && !line.contains("NT3") && !line.contains("OBC")) // System.out.println("Branch name is"+line.substring(12)); // } // count++; // System.out.println(line); } //example.getSinglecollegedata().AddObject(example.getCastDataObject()); example.getSinglecollegedata().AddObject(example.getCastDataObject()); example.addObject(example.getSinglecollegedata()); // System.out.println("Size is" + example.getObject().size()); // for (CollegeDataObject collegeDataObject :example.getObject()) { // System.out.println("College 
Name is " + collegeDataObject.getCollegeName()); // for (CastDataObject object : collegeDataObject.getObjects()) { // System.out.println("Branch name is " + object.getBranchName()); // // if (isNumeric(object.getOpen())) // System.out.println("Open rank is " + object.getOpen()); // // if (isNumeric(object.getSc())) // System.out.println("SC rank is " + object.getSc()); // // if (isNumeric(object.getSt())) // System.out.println("ST rank is " + object.getSt()); // // if (isNumeric(object.getVj())) // System.out.println("VJ rank is " + object.getVj()); // // if (isNumeric(object.getNt1())) // System.out.println("NT1 rank is " + object.getNt1()); // // if (isNumeric(object.getNt2())) // System.out.println("NT2 rank is " + object.getNt2()); // // if (isNumeric(object.getNt3())) // System.out.println("NT3 rank is " + object.getNt3()); // // if (isNumeric(object.getOBC())) // System.out.println("OBC rank is " + object.getOBC()); // } // } //BufferedReader reader=new Bufferef ll) //{ // System.out.println(line); //} MongoClient mongo1 = new MongoClient("localhost"); MongoDatabase db = mongo1.getDatabase("CollegeFinder"); MongoCollection<Document> coll = db.getCollection("project"); for (CollegeDataObject collegeDataObject : example.getObject()) { Document college = new Document(); college.append("college_name", collegeDataObject.getCollegeName()); college.append("city", collegeDataObject.getCity()); // System.out.println(collegeDataObject.getCollegeName() + " City "+collegeDataObject.getCity()); List<Document> branches = new ArrayList<>(); for (CastDataObject object : collegeDataObject.getObjects()) { Document branch = new Document(); branch.append("branch_name", object.getBranchName()); branch.append("open", object.getOpen()); branch.append("sc", object.getSc()); branch.append("st", object.getSt()); branch.append("vj", object.getVj()); branch.append("nt1", object.getNt1()); branch.append("nt2", object.getNt2()); branch.append("nt3", object.getNt3()); branch.append("obc", 
object.getOBC()); branches.add(branch); } college.append("Branch", branches); coll.insertOne(college); } // m.append("name","sachin"); // m.append("year", "first year"); // m.append("branch", "seond year"); // List <Document> list=new ArrayList<>(); // list.add(m); // Document s=new Document(); // s.append("name", "Arjun"); // s.append("year", "first year"); // s.append("branch", "seond year"); // list.add(s); // Document parent=new Document(); // parent.append("embed", list); // coll.insertOne(parent); // p // List <Document> list=new ArrayList<>(); // for (int i=0;i<10;i++) // { // list.add(new Document(m)); // } // coll.insertMany(list); // coll.insertOne(parent); // TODO code application logic here // coll.find(); }
From source file:example.Example.java
/**
 * Parses a cutoff-list PDF using category markers and stores the result in
 * MongoDB.
 *
 * <p>The method opens the hard-coded file
 * {@code /home/sachin/Downloads/2014ENGG_Cutoff_CAP2.pdf}, extracts the text of
 * each page with {@code PdfTextExtractor.getTextFromPage} and walks it line by
 * line with a 1-based counter ({@code count}) that restarts on every page:
 * <ul>
 *   <li>line 7: college name, taken from the text after the first 6 characters.
 *       When it differs from the previous college, the previous college object
 *       is added to {@code example} and a new one is started; if the name
 *       contains commas, the part after the last comma is stored as the city.</li>
 *   <li>line 8: branch name, taken from the text after the first 11 characters.
 *       If the line contains "NT1", "NT2", "NT3" or "OBC" the rest of the page
 *       is abandoned ({@code break}).</li>
 *   <li>line 10: if the line contains "GOPENO" or "GOPENH", the open value is
 *       read from the second space-separated token, or from the following line
 *       when the marker line has a single token. An inner loop then consumes
 *       all remaining lines of the page, applying the same rule for the markers
 *       GSCO/GSCH, GSTO/GSTH, GVJO/GVJH, GNT1O/GNT1H, GNT2O/GNT2H, GNT3O/GNT3H
 *       and GOBCO/GOBCH.</li>
 * </ul>
 * Values are stored only when {@code isNumeric} accepts them. After the page
 * loop the last branch and college are flushed; each college is printed and
 * inserted as one document into collection {@code cap_round2} of database
 * {@code CollegeFinder} on {@code localhost}, with every category value
 * converted through {@code Integer.valueOf}.
 *
 * <p>NOTE(review): the page loop condition is {@code i < reader.getNumberOfPages()},
 * so the final page is never read - confirm whether that is intended.
 * <br>NOTE(review): {@code getPrevCollege() != ""} and {@code getPrevBranch() != ""}
 * compare references, not contents.
 * <br>NOTE(review): each {@code line = br.readLine()} inside a marker branch is
 * followed by {@code line.split(...)} without a null check, so a marker on the
 * last line of a page would raise a NullPointerException; the later marker
 * checks in the same iteration also see the advanced line.
 * <br>NOTE(review): {@code Integer.valueOf(...)} throws NumberFormatException for
 * null or non-numeric text; whether the getters can return such values depends
 * on CastDataObject's defaults - verify.
 * <br>NOTE(review): {@code page.getBytes()} and {@code InputStreamReader} use the
 * platform default charset; {@code exclusions} is never read; {@code reader}
 * and {@code mongo1} are never closed.
 *
 * @param args command-line arguments; not used
 * @throws IOException if the PDF cannot be opened or a page cannot be read
 */
public static void main(String[] args) throws IOException { String line, prev = ""; Example example = new Example(); String[] exclusions = { "100550310 - Food Technology", "100551710 - Oil and Paints Technology", "100552410 - Paper and Pulp Technology", "100552710 - Petro Chemical Engineering" }; // File file=new File("/home/sachin/Downloads/2014ENGG_Cutoff_CAP1.txt"); PdfReader reader = new PdfReader("/home/sachin/Downloads/2014ENGG_Cutoff_CAP2.pdf"); // System.out.println("This PDF has " + reader.getNumberOfPages() + " pages."); for (int i = 1; i < reader.getNumberOfPages(); i += 1) { String page = PdfTextExtractor.getTextFromPage(reader, i); InputStream is = new ByteArrayInputStream(page.getBytes()); // read it with BufferedReader BufferedReader br = new BufferedReader(new InputStreamReader(is)); int count = 1; while ((line = br.readLine()) != null) { if (count == 7) { // System.out.println("substring is"+line.substring(6)); // System.out.println("Prev college is"+example.getPrevCollege()); if (!line.substring(6).trim().equals(example.getPrevCollege())) { // System.out.println("College Name" + line); if (example.getPrevCollege() != "") { example.addObject(example.getSinglecollegedata()); }/*w ww .j a v a2 s . 
com*/ example.setPrevCollege(line.substring(6).trim()); CollegeDataObject object = new CollegeDataObject(); object.setCollegeName(line.substring(6).trim()); int temp; if ((temp = line.substring(6).trim().split(",").length) > 1) { object.setCity(line.substring(6).trim().split(",")[temp - 1].trim()); object.setCollegeName(line.substring(6).trim() .replace(line.substring(6).trim().split(",")[temp - 1].trim(), "").trim() .replace(",", "").trim()); } example.setSinglecollegedata(object); } else { } } if (count == 8) { // System.out.print("Branch name" + line); if (!line.contains("NT1") && !line.contains("NT2") && !line.contains("NT3") && !line.contains("OBC")) { if (example.getPrevBranch() != "") { example.getSinglecollegedata().AddObject(example.getCastDataObject()); CastDataObject object = new CastDataObject(); object.setBranchName(line.substring(11).trim()); example.setPrevBranch(line.substring(11).trim()); example.setCastDataObject(object); } else { CastDataObject object = new CastDataObject(); object.setBranchName(line.substring(11).trim()); example.setCastDataObject(object); example.setPrevBranch(line.substring(11).trim()); } } else { break; } } if (count == 10) { // System.out.println("open rank"+line); if (line.contains("GOPENO") || line.contains("GOPENH")) { // if (isNumeric(line)) { // if (line.trim().equals(null)) // line = "0"; if (line.split(" ").length > 1) { if (isNumeric(line.split(" ")[1].trim())) example.getCastDataObject().setOpen(line.split(" ")[1].trim()); } else { line = br.readLine(); if (line.split(" ").length > 1) { if (isNumeric(line.split(" ")[1].trim())) example.getCastDataObject().setOpen(line.split(" ")[1].trim()); } else { if (isNumeric(line.trim())) example.getCastDataObject().setOpen(line.trim()); } } // } } while ((line = br.readLine()) != null) { if (line.contains("GSCO") || line.contains("GSCH")) { // if (isNumeric(line)) { // if (line.trim().equals(null)) // line = "0"; if (line.split(" ").length > 1) { if (isNumeric(line.split(" 
")[1].trim())) example.getCastDataObject().setSc(line.split(" ")[1].trim()); } else { line = br.readLine(); if (line.split(" ").length > 1) { if (isNumeric(line.split(" ")[1].trim())) example.getCastDataObject().setSc(line.split(" ")[1].trim()); } else { if (isNumeric(line.trim())) example.getCastDataObject().setSc(line.trim()); } } //} } if (line.contains("GSTO") || line.contains("GSTH")) { // if (isNumeric(line)) { // if (line.trim().equals(null)) // line = "0"; if (line.split(" ").length > 1) { if (isNumeric(line.split(" ")[1].trim())) example.getCastDataObject().setSt(line.split(" ")[1].trim()); } else { line = br.readLine(); if (line.split(" ").length > 1) { if (isNumeric(line.split(" ")[1].trim())) example.getCastDataObject().setSt(line.split(" ")[1].trim()); } else { if (isNumeric(line.trim())) example.getCastDataObject().setSt(line.trim()); } } //} } if (line.contains("GVJO") || line.contains("GVJH")) { // if (isNumeric(line)) { // if (line.trim().equals(null)) // line = "0"; if (line.split(" ").length > 1) { if (isNumeric(line.split(" ")[1].trim())) example.getCastDataObject().setVj(line.split(" ")[1].trim()); } else { line = br.readLine(); if (line.split(" ").length > 1) { if (isNumeric(line.split(" ")[1].trim())) example.getCastDataObject().setVj(line.split(" ")[1].trim()); } else { if (isNumeric(line.trim())) example.getCastDataObject().setVj(line.trim()); } } //System.out.println("Line is"+line.trim()); // } } if (line.contains("GNT1O") || line.contains("GNT1H")) { // if (isNumeric(line)) { // if (line.trim().equals(null)) // line = "0"; if (line.split(" ").length > 1) { if (isNumeric(line.split(" ")[1].trim())) example.getCastDataObject().setNt1(line.split(" ")[1].trim()); } else { line = br.readLine(); if (line.split(" ").length > 1) { if (isNumeric(line.split(" ")[1].trim())) example.getCastDataObject().setNt1(line.split(" ")[1].trim()); } else { if (isNumeric(line.trim())) example.getCastDataObject().setNt1(line.trim()); } } // } } if 
(line.contains("GNT2O") || line.contains("GNT2H")) { // if (isNumeric(line)) { // if (line.trim().equals(null)) // line = "0"; if (line.split(" ").length > 1) { if (isNumeric(line.split(" ")[1].trim())) example.getCastDataObject().setNt2(line.split(" ")[1].trim()); } else { line = br.readLine(); if (line.split(" ").length > 1) { if (isNumeric(line.split(" ")[1].trim())) example.getCastDataObject().setNt2(line.split(" ")[1].trim()); } else { if (isNumeric(line.trim())) example.getCastDataObject().setNt2(line.trim()); } } // } } if (line.contains("GNT3O") || line.contains("GNT3H")) { // if (isNumeric(line)) { // if (line.trim().equals(null)) // line = "0"; if (line.split(" ").length > 1) { if (isNumeric(line.split(" ")[1].trim())) example.getCastDataObject().setNt3(line.split(" ")[1].trim()); } else { line = br.readLine(); if (line.split(" ").length > 1) { if (isNumeric(line.split(" ")[1].trim())) example.getCastDataObject().setNt3(line.split(" ")[1].trim()); } else { if (isNumeric(line.trim())) example.getCastDataObject().setNt3(line.trim()); } } // } } if (line.contains("GOBCO") || line.contains("GOBCH")) { // if (isNumeric(line)) { // if (line.trim().equals(null)) // line = "0"; if (line.split(" ").length > 1) { if (isNumeric(line.split(" ")[1].trim())) example.getCastDataObject().setOBC(line.split(" ")[1].trim()); } else { line = br.readLine(); if (line.split(" ").length > 1) { if (isNumeric(line.split(" ")[1].trim())) example.getCastDataObject().setOBC(line.split(" ")[1].trim()); } else { if (isNumeric(line.trim())) example.getCastDataObject().setOBC(line.trim()); } } // } } } } // if (count == 14) { // // System.out.println("sc rank is"+line); // if (isNumeric(line)) { // if (line.trim().equals(null)) // line = "0"; // example.getCastDataObject().setSc(line.trim()); // } // // // } // if (count == 17) { // // System.out.println("st rank is"+line); // if (isNumeric(line)) { // if (line.trim().equals(null)) // line = "0"; // 
example.getCastDataObject().setSt(line.trim()); // // } // } // if (count == 20) { // if (isNumeric(line)) { // if (line.trim().equals(null)) // line = "0"; // if (prev.equals("GVJO")) { // // System.out.println("VJ rank is"+line); // example.getCastDataObject().setVj(line.trim()); // line = br.readLine(); // line = br.readLine(); // line = br.readLine(); // count += 3; // // System.out.println("NT1 rank is"+line); // example.getCastDataObject().setNt1(line.trim()); // // } else { // // System.out.println("NT1 rank is" + line); // example.getCastDataObject().setNt1(line.trim()); // count += 3; // } // } // // } // if (count==23) // { // System.out.println("NT1 rank is"+line); // } // if (count == 26) { // // if (isNumeric(line)) { // // System.out.println("NT2 rank is"+line); // if (line.trim().equals(null)) // line="0"; // example.getCastDataObject().setNt2(line.trim()); // } // } // if (count == 29) { // // if (prev.equals("GNT3O")) { // // System.out.println("NT3 rank is" + line); // if (isNumeric(line)) { // if (line.trim().equals(null)) // line="0"; // example.getCastDataObject().setNt3(line.trim()); // line = br.readLine(); // line = br.readLine(); // line = br.readLine(); // // System.out.println("OBC rank is" + line); // } // if (isNumeric(line.split(" ")[0])) { // if (line.split(" ")[0].trim().equals(null)) // line="0"; // example.getCastDataObject().setOBC(line.split(" ")[0].trim()); // } // } else { // // System.out.println("OBC rank is" + line); // if (isNumeric(line.split(" ")[0])) { // if (line.split(" ")[0].trim().equals(null)) // line = "0"; // example.getCastDataObject().setOBC(line.split(" ")[0].trim()); // } // } // } count++; prev = line; } } // if (count==7) // { // if (!line.substring(6).equals(example.getPrevCollege())) // System.out.println("College name is"+line.substring(6)); // example.setPrevCollege(line.substring(6)); // // } // if (count==8) // { // if (!line.contains("NT1") && !line.contains("NT2") && !line.contains("NT3") && 
!line.contains("OBC")) // System.out.println("Branch name is"+line.substring(12)); // } // count++; // System.out.println(line); // } // } //example.getSinglecollegedata().AddObject(example.getCastDataObject()); example.getSinglecollegedata().AddObject(example.getCastDataObject()); example.addObject(example.getSinglecollegedata()); // System.out.println("Size is" + example.getObject().size()); // for (CollegeDataObject collegeDataObject :example.getObject()) { // System.out.println("College Name is " + collegeDataObject.getCollegeName()); // for (CastDataObject object : collegeDataObject.getObjects()) { // System.out.println("Branch name is " + object.getBranchName()); // // if (isNumeric(object.getOpen())) // System.out.println("Open rank is " + object.getOpen()); // // if (isNumeric(object.getSc())) // System.out.println("SC rank is " + object.getSc()); // // if (isNumeric(object.getSt())) // System.out.println("ST rank is " + object.getSt()); // // if (isNumeric(object.getVj())) // System.out.println("VJ rank is " + object.getVj()); // // if (isNumeric(object.getNt1())) // System.out.println("NT1 rank is " + object.getNt1()); // // if (isNumeric(object.getNt2())) // System.out.println("NT2 rank is " + object.getNt2()); // // if (isNumeric(object.getNt3())) // System.out.println("NT3 rank is " + object.getNt3()); // // if (isNumeric(object.getOBC())) // System.out.println("OBC rank is " + object.getOBC()); // } // } //BufferedReader reader=new Bufferef ll) //{ // System.out.println(line); //} MongoClient mongo1 = new MongoClient("localhost"); MongoDatabase db = mongo1.getDatabase("CollegeFinder"); MongoCollection<Document> coll = db.getCollection("cap_round2"); for (CollegeDataObject collegeDataObject : example.getObject()) { Document college = new Document(); college.append("college_name", collegeDataObject.getCollegeName()); college.append("city", collegeDataObject.getCity()); System.out.println(collegeDataObject.getCollegeName() + " City " + 
collegeDataObject.getCity()); List<Document> branches = new ArrayList<>(); for (CastDataObject object : collegeDataObject.getObjects()) { Document branch = new Document(); branch.append("branch_name", object.getBranchName()); branch.append("open", Integer.valueOf(object.getOpen())); branch.append("sc", Integer.valueOf(object.getSc())); branch.append("st", Integer.valueOf(object.getSt())); branch.append("vj", Integer.valueOf(object.getVj())); branch.append("nt1", Integer.valueOf(object.getNt1())); branch.append("nt2", Integer.valueOf(object.getNt2())); branch.append("nt3", Integer.valueOf(object.getNt3())); branch.append("obc", Integer.valueOf(object.getOBC())); branches.add(branch); } college.append("Branch", branches); coll.insertOne(college); } // m.append("name","sachin"); // m.append("year", "first year"); // m.append("branch", "seond year"); // List <Document> list=new ArrayList<>(); // list.add(m); // Document s=new Document(); // s.append("name", "Arjun"); // s.append("year", "first year"); // s.append("branch", "seond year"); // list.add(s); // Document parent=new Document(); // parent.append("embed", list); // coll.insertOne(parent); // p // List <Document> list=new ArrayList<>(); // for (int i=0;i<10;i++) // { // list.add(new Document(m)); // } // coll.insertMany(list); // coll.insertOne(parent); // TODO code application logic here // coll.find(); }
From source file:freemind.modes.ControllerAdapter.java
License:Open Source License
/**
 * Renders every page of a PDF to a JPEG file and fills the controller's slide
 * list with one {@code SlideData} entry per slide title.
 *
 * <p>Steps, in the order the code performs them:
 * <ol>
 *   <li>Builds the output directory name from {@code filePath} up to where
 *       {@code fileName} starts, plus {@code fileName} up to ".pdf"; creates
 *       the directory if it does not exist and appends a backslash.</li>
 *   <li>Memory-maps the PDF read-only; the buffer is later handed to
 *       {@code PDFFile} for rendering.</li>
 *   <li>First pass: the first line of page 1 becomes the first slide. If a
 *       page contains "&lt;&lt;table of contents&gt;&gt;", {@code templateChk}
 *       is set and each line of that page between the first and the last is
 *       turned into a slide: the first space-separated token is split on "."
 *       and parsed as integers into the slide's index list, the remaining
 *       tokens form the node name, and slides are chained through
 *       {@code setPrev}/{@code setNext}. The pass stops after that page.</li>
 *   <li>If {@code templateChk} is false the slide list is cleared and rebuilt
 *       from the first line of each page. In both cases a counting pass then
 *       increments {@code imgCnt} on the first slide whose node name equals
 *       the page's first line once spaces are removed.</li>
 *   <li>Final pass: every page whose first line is not exactly
 *       "&lt;&lt;table of contents&gt;&gt;" is drawn into an RGB
 *       {@code BufferedImage} and written as a ".jpg" into the output
 *       directory. The file is named after the matching slide (with a numeric
 *       suffix when the image count is not 1) or "undefined" plus a counter
 *       when the first line is empty. Write failures are printed and
 *       skipped.</li>
 *   <li>Stores the page count on the first slide and closes the reader.</li>
 * </ol>
 *
 * <p>NOTE(review): line breaks were lost in this copy of the method, so it is
 * unclear whether {@code dupChk = true;} and {@code sData.setNodeName(newLine[0]);}
 * are live statements or the tails of {@code //} comments - confirm against the
 * original file before relying on the duplicate check.
 * <br>NOTE(review): {@code templateChk} is not declared in this method
 * (presumably a field) and is never reset to false here - TODO confirm.
 * <br>NOTE(review): {@code raf} is never closed, and {@code reader.close()} is
 * reached only when no exception is thrown.
 * <br>NOTE(review): the hard-coded backslash separator and the
 * {@code indexOf(".pdf")} lookup assume Windows paths and a lower-case ".pdf"
 * extension.
 *
 * @param filePath full path of the PDF to read
 * @param fileName file name component of {@code filePath}, including ".pdf"
 * @throws IOException if the PDF cannot be opened, mapped or read
 */
public void pdf2img(String filePath, String fileName) throws IOException { PdfReader reader = new PdfReader(filePath); int page = reader.getNumberOfPages(); String tmp[];/*from w w w. ja v a 2 s . c o m*/ String temp[]; ArrayList<SlideData> slideList; String newLine[]; slideList = getController().getSlideList(); SlideData sData = null; String tmpStr = ""; String data = ""; String oldStr = ""; String mkDirPath; int noTitle = 0; int imgNum = 0; int tmpNum = 0; boolean dupChk; boolean noTitleChk; SlideData prev = null; mkDirPath = filePath.substring(0, filePath.indexOf(fileName.toString())); mkDirPath = mkDirPath + fileName.substring(0, fileName.indexOf(".pdf")); File mkDirFile = new File(mkDirPath); if (!mkDirFile.exists()) mkDirFile.mkdir(); mkDirPath += "\\"; File file = new File(filePath); RandomAccessFile raf = new RandomAccessFile(file, "r"); FileChannel channel = raf.getChannel(); ByteBuffer buf = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size()); for (int i = 1; i <= page; i++) { String str = PdfTextExtractor.getTextFromPage(reader, i); System.out.flush(); if (i == 1) { newLine = str.split("\n"); sData = new SlideData(); sData.setNodeName(newLine[0]); sData.setImgPath(mkDirPath); slideList.add(sData); prev = sData; } if (str.indexOf("<<table of contents>>") >= 0) { templateChk = true; newLine = str.split("\n"); for (int j = 1; j < newLine.length - 1; j++) { data = ""; sData = new SlideData(); tmp = newLine[j].split(" "); temp = tmp[0].split("\\."); for (int l = 0; l < temp.length; l++) // idx = depth sData.getIdxList().add(Integer.parseInt(temp[l])); for (int k = 1; k < tmp.length; k++) { data += tmp[k]; // 1 1. 
// , if (k != tmp.length - 1 && !tmp[k].equals("")) data += " "; } sData.setImgPath(prev.getImgPath()); sData.setNodeName(data); sData.setPrev(prev); prev.setNext(sData); prev = prev.getNext(); slideList.add(sData); } break; } } // sList if (!templateChk) { slideList.clear(); // template for (int i = 1; i <= page; i++) { String str = PdfTextExtractor.getTextFromPage(reader, i); System.out.flush(); if (i == 1) { newLine = str.split("\n"); sData = new SlideData(); sData.setNodeName(newLine[0]); sData.setImgPath(mkDirPath); slideList.add(sData); } else { newLine = str.split("\n"); dupChk = false; for (int j = 0; j < slideList.size(); j++) { sData = slideList.get(j); if (newLine[0].equals(sData.getNodeName())) { // dupChk = true; break; } } if (!dupChk) { sData = new SlideData(); // sData.setNodeName(newLine[0]); slideList.add(sData); } } } // for (int i = 1; i <= page; i++) { String str = PdfTextExtractor.getTextFromPage(reader, i); System.out.flush(); data = ""; newLine = str.split("\n"); data = newLine[0].replace(" ", ""); for (int j = 0; j < slideList.size(); j++) { sData = slideList.get(j); tmpStr = sData.getNodeName().replace(" ", ""); if (data.equals(tmpStr)) { sData.setImgCnt(sData.getImgCnt() + 1); break; } } } } else { // template for (int i = 1; i <= page; i++) { String str = PdfTextExtractor.getTextFromPage(reader, i); System.out.flush(); data = ""; newLine = str.split("\n"); tmp = newLine[0].split(" "); for (int k = 0; k < tmp.length; k++) data += tmp[k]; data = data.replace(" ", ""); for (int j = 0; j < slideList.size(); j++) { sData = slideList.get(j); tmpStr = sData.getNodeName().replace(" ", ""); if (data.equals(tmpStr)) { sData.setImgCnt(sData.getImgCnt() + 1); break; } } } } for (int i = 1; i <= page; i++) { data = ""; noTitleChk = false; PDFFile pdffile = new PDFFile(buf); String str = PdfTextExtractor.getTextFromPage(reader, i); System.out.flush(); newLine = str.split("\n"); if (newLine[0].equals("<<table of contents>>")) continue; if 
(newLine[0].equals("")) { data += "undefined" + noTitle; imgNum = 1; noTitle++; noTitleChk = true; } else { data = newLine[0].replace(" ", ""); for (int j = 0; j < slideList.size(); j++) { sData = slideList.get(j); tmpStr = sData.getNodeName().replace(" ", ""); if (data.equals(tmpStr)) { if (oldStr.equals(tmpStr)) break; oldStr = data; tmpNum = imgNum = sData.getImgCnt(); break; } } } // draw the first page to an image PDFPage pdfPage = pdffile.getPage(i); // get the width and height for the doc at the default zoom Rectangle rect = new Rectangle(0, 0, (int) pdfPage.getBBox().getWidth(), (int) pdfPage.getBBox().getHeight()); // generate the image Image image = pdfPage.getImage(rect.width, rect.height, // width // & // height rect, // clip rect null, // null for the ImageObserver true, // fill background with white true // block until drawing is done ); int w = image.getWidth(null); int h = image.getHeight(null); BufferedImage bi = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB); Graphics2D g2 = bi.createGraphics(); g2.drawImage(image, 0, 0, null); g2.dispose(); try { if (imgNum == 1) { if (noTitleChk) ImageIO.write(bi, "jpg", new File(mkDirPath + data + ".jpg")); else ImageIO.write(bi, "jpg", new File(mkDirPath + sData.getNodeName() + ".jpg")); } else { ImageIO.write(bi, "jpg", new File(mkDirPath + sData.getNodeName() + (imgNum - tmpNum) + ".jpg")); tmpNum--; } } catch (IOException ioe) { System.out.println("write: " + ioe.getMessage()); } data = ""; } sData = slideList.get(0); sData.setsCnt(page); reader.close(); }
From source file:freemind.modes.ControllerAdapter.java
License:Open Source License
/**
 * Converts the table-of-contents page of a PDF into a mind-map ({@code .mm})
 * XML file written next to the PDF.
 *
 * <p>The output path is {@code filePath} with its last four characters
 * replaced by ".mm", and the root node text is {@code fileName} without its
 * last four characters (both are therefore expected to end in a 4-character
 * extension such as ".pdf"). The first page containing
 * "&lt;&lt;table of contents&gt;&gt;" is parsed into {@code TableData}
 * entries, which are then written as nested {@code <node>} elements.
 *
 * <p>Changes compared with the previous implementation:
 * <ul>
 *   <li>the writer is closed in a {@code finally} block and the
 *       {@code PdfReader} is closed after parsing, so neither leaks;</li>
 *   <li>the {@code <map>} opening tag is always written, so the file is
 *       well-formed even when no table-of-contents page is found (previously
 *       only the closing tag was written in that case);</li>
 *   <li>node and root texts are XML-escaped before being placed in the
 *       {@code TEXT} attribute;</li>
 *   <li>unused locals were removed.</li>
 * </ul>
 *
 * @param filePath full path of the PDF to read
 * @param fileName file name of the PDF, used as the root node text
 * @throws IOException if the output file cannot be created or written
 */
public void pdf2mm(String filePath, String fileName) throws IOException {
    String mmFilePath = filePath.substring(0, filePath.length() - 4);
    fileName = fileName.substring(0, fileName.length() - 4);
    mmFilePath += ".mm";
    File mmFile = new File(mmFilePath);
    ArrayList<TableData> root = new ArrayList<TableData>();

    OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(mmFile), "UTF-8");
    try {
        // Parsing is best effort: on failure whatever was collected so far is still written.
        collectTocEntries(filePath, root);

        out.write("<map version=\"0.9.0\">\n");
        out.write("<node CREATED=\"1365038113483\" ID=\"ID_1002961678\" " + "MODIFIED=\"1365038132371\" "
                + "TEXT=\"" + escapeXmlAttribute(fileName) + "\">\n");

        for (int i = 0; i < root.size(); i++) {
            TableData showTable = root.get(i);
            out.write("<node CREATED=\"1365038113483\" ID=\"ID_1002961678\" MODIFIED=\"1365038132371\" ");
            if (!showTable.getDirection().equals("")) {
                out.write("POSITION=\"" + showTable.getDirection() + "\" ");
            }
            out.write("TEXT=\"" + escapeXmlAttribute(showTable.getData().trim()) + "\"");
            // A node with children stays open; a leaf is self-closing.
            if (showTable.isHaveChild()) {
                out.write(">\n");
            } else {
                out.write("/>\n");
            }

            // Close as many ancestors as the depth drops before the next entry
            // (or all of them after the last entry).
            int dif;
            if (i == root.size() - 1) {
                dif = showTable.getDepth();
            } else {
                dif = showTable.getDepth() - root.get(i + 1).getDepth();
            }
            for (int j = 0; j < dif; j++) {
                out.write("</node>\n");
            }
        }
        out.write("</node>\n</map>\n");
    } finally {
        out.close();
    }
}

/** Parses the first table-of-contents page of the PDF into {@code root}; failures are printed, not thrown. */
private void collectTocEntries(String filePath, ArrayList<TableData> root) throws IOException {
    PdfReader reader = null;
    try {
        reader = new PdfReader(filePath);
        int page = reader.getNumberOfPages();
        String direction = "left";
        TableData oldTableData = new TableData();

        for (int i = 1; i <= page; i++) {
            String str = PdfTextExtractor.getTextFromPage(reader, i);
            System.out.flush();
            if (str.indexOf("<<table of contents>>") < 0) {
                continue;
            }

            String[] newLine = str.split("\n");
            // The first and last lines of the page are skipped, as before.
            for (int j = 1; j < newLine.length - 1; j++) {
                String data = "";
                TableData childTable = new TableData();
                // Entries in the second half of the page go to the right side of the map.
                if (j > (newLine.length / 2)) {
                    direction = "right";
                }

                String[] tmp = newLine[j].split("\\.");
                if (tmp[1].substring(0, 1).equals(" ")) {
                    // "N. title": a top-level entry.
                    data = tmp[1].substring(tmp[1].indexOf(" "), tmp[1].length());
                    childTable.setDirection(direction);
                    childTable.setHeadline(tmp[0]);
                    childTable.setDepth(0);
                } else {
                    // "N.M... title": depth is the number of dots in the line.
                    childTable.setDepth(tmp.length - 1);
                    tmp = newLine[j].split(" ");
                    for (int k = 1; k < tmp.length; k++) {
                        data += tmp[k] + " ";
                    }
                    childTable.setHeadline(tmp[0]);
                    data = data.substring(0, data.length() - 1);
                }
                childTable.setData(data);

                // The previous entry has children when this headline extends it by two characters.
                String tmpStr = childTable.getHeadline().toString();
                if (tmpStr.length() > 2
                        && tmpStr.substring(0, tmpStr.length() - 2).equals(oldTableData.getHeadline())) {
                    oldTableData.setHaveChild(true);
                }

                // Each entry is added one iteration late, once its child flag is known.
                if (j > 1) {
                    root.add(oldTableData);
                }
                oldTableData = childTable;
                if (j == newLine.length - 2) {
                    root.add(childTable);
                }
            }
            break;
        }
    } catch (Exception e) {
        // Deliberate best effort, kept from the original: report and continue with partial data.
        e.printStackTrace();
    } finally {
        if (reader != null) {
            reader.close();
        }
    }
}

/** Escapes the characters that are not allowed verbatim inside a double-quoted XML attribute value. */
private static String escapeXmlAttribute(String text) {
    StringBuilder escaped = new StringBuilder(text.length());
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        switch (c) {
        case '&':
            escaped.append("&amp;");
            break;
        case '<':
            escaped.append("&lt;");
            break;
        case '>':
            escaped.append("&gt;");
            break;
        case '"':
            escaped.append("&quot;");
            break;
        default:
            escaped.append(c);
            break;
        }
    }
    return escaped.toString();
}
From source file:itextblast.ITextBlast.java
private static void processQAFile(String qa_filename, Boolean has_frontpage) throws IOException, DocumentException { // use one of the previous examples to create a PDF // new MovieTemplates().createPdf(MovieTemplates.RESULT); // Create a reader; from current existing file // Next time pass it from args .. PdfReader reader = new PdfReader(String.format(ITextBlast.working_dir + SOURCE, qa_filename)); ITextBlast.my_reader = reader;// ww w . j a va 2s .co m // We'll create as many new PDFs as there are pages // Document document; // PdfCopy copy; // loop over all the pages in the original PDF int n = reader.getNumberOfPages(); // For test of extraction and regexp; use first 5 pages .. // n = 15; // Text Extraction Strategy here ... // LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy(); // SimpleTextExtractionStrategy strategy = new SimpleTextExtractionStrategy(); // Both ^ does not work well; weird behavior ... no need so clever .. // START SMART Start Number ******** Pattern smart_start_pattern; smart_start_pattern = Pattern.compile(".*?SOALAN.*?N.*?O.*?(\\d+)\\b+.*", Pattern.CASE_INSENSITIVE); // Extract cover page number as smartly as possible?? String cover_page_content = PdfTextExtractor.getTextFromPage(reader, 1); Matcher smart_start_matcher = smart_start_pattern.matcher(cover_page_content); String smart_start_question_number = null; if (smart_start_matcher.find()) { // Extract the question number based on backreference smart_start_question_number = smart_start_matcher.group(1); // How will it look when using a different strategy? 
out.println("Matched " + smart_start_matcher.group(0) + " and SMART Start Number: " + smart_start_question_number); } // END SMART Start Number ******** Pattern liberal_found_question_pattern_uno; liberal_found_question_pattern_uno = Pattern.compile(".*N.*O.*SOALAN.*", Pattern.CASE_INSENSITIVE); Pattern liberal_found_question_pattern_dos = Pattern.compile(".*SOALAN.*N.*O.*", Pattern.CASE_INSENSITIVE); Pattern pattern_uno; // pattern = Pattern.compile("^.*NO.*SOALAN.*?(\\d+).*$", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); // pattern = Pattern.compile(".*SOALAN.*?(\\d+).*", Pattern.CASE_INSENSITIVE); pattern_uno = Pattern.compile(".*N.*O.*SOALAN.*?(\\d+)\\b+.*", Pattern.CASE_INSENSITIVE); Pattern pattern_dos = Pattern.compile(".*SOALAN.*N.*O.*?(\\d+)\\b+.*", Pattern.CASE_INSENSITIVE); // OPTION 2 is to try with the next available number between word boundaries .. but may then need non-greedy .. // Init start and end page int start_page = 1; int end_page = 1; String question_number = "0-intro"; // This is for SOALAN LISAN; which has no Front Page // the Start Question Number should then be set to SMART Start Number if (!has_frontpage) { question_number = smart_start_question_number; } for (int i = 1; i < n; i++) { // init found_question_number String found_question_number = null; boolean found_match = false; // PdfDictionary page = reader.getPageN(i); // use location based strategy out.println("Page " + i); out.println("==========="); // out.println(PdfTextExtractor.getTextFromPage(reader, i, strategy)); String content = PdfTextExtractor.getTextFromPage(reader, i); // DEBUG: Uncomment below .. // out.println(content); Matcher liberal_uno_matcher = liberal_found_question_pattern_uno.matcher(content); if (liberal_uno_matcher.find()) { out.println("Matched UNO!"); found_match = true; Matcher matcher = pattern_uno.matcher(content); // Loop to find the digit; it is possible it is not found an dleft as null .. 
while (matcher.find()) { // Extract the question number based on backreference found_question_number = matcher.group(1); // How will it look when using a different strategy? out.println("Matched " + matcher.group(0) + " and Question Number: " + found_question_number); } } else if (liberal_found_question_pattern_dos.matcher(content).find()) { if ("0-intro".equals(question_number)) { out.println("SMART!!!"); } else { found_match = true; out.println("Matched DOS!"); Matcher matcher = pattern_dos.matcher(content); // Loop to find the digit; it is possible it is not found an dleft as null .. while (matcher.find()) { // Extract the question number based on backreference found_question_number = matcher.group(1); // How will it look when using a different strategy? out.println( "Matched " + matcher.group(0) + " and Question Number: " + found_question_number); } } } // If matched; take out the last start, end if (found_match) { // copy page over and write it down .. end_page = i - 1; if (end_page < 1) { end_page = 1; } if (null == found_question_number) { if ("0-intro".equals(question_number)) { // After intro; if got problem; try the smart start found_question_number = smart_start_question_number; out.println("First question could not determine number; using Q No. => " + found_question_number); // Print out content to debug out.println("*****DEBUG Content*******"); out.println(content); } else { // otherwise; use current question and just append Unix timestamp .. found_question_number = question_number + "_" + (System.currentTimeMillis() / 1000L); out.println( "Unexpectedly could not determine number; using Q No. 
=> " + found_question_number); // Print out content to debug out.println("*****DEBUG Content*******"); out.println(content); } } // Write based on previous confirmed question_number ITextBlast.copySelectedQuestionPage(start_page, end_page, question_number); // re-set to current page start_page = i; end_page = i; question_number = found_question_number; } // out.println(PdfTextExtractor.getTextFromPage(reader, i)); // Pattern RegExp: #^.*NO.*SOALAN.*(\d)+$#im out.println(); out.println(); // use helper file to dump out // Look out for pattern "NO. SOALAN" // Once see pattern or reach end; snip off copy from start to end // reset start/end // else increase the end } // If end of the loop there are still straglers; mark with the special question_number = 999 if (start_page <= end_page) { // Should always happen actually .. ITextBlast.copySelectedQuestionPage(start_page, end_page, question_number); } reader.close(); }
From source file:lecteur.Interface.java
private static String ReadPDF(String pdf_url) { String[] row;/*from ww w . j a va 2s . c om*/ ArrayList<Bilan> tabRubriqueBilanTotal = new ArrayList<>(); ArrayList<Bilan> tabRubriqueBilan = null; System.out.println("ReadPDF"); StringBuilder str = new StringBuilder(); try { PdfReader reader = new PdfReader(pdf_url); int nbpage = reader.getNumberOfPages(); System.out.println("Nombre de page = " + nbpage); //Recherche page BILAN - ACTIF //Recherche page BILAN - PASSIF //Recherche page //pour chaque page, lire ligne. for (int i = 22; i <= 22; i++) { //for(int i=1;i<=nbpage;i++) { String str2 = PdfTextExtractor.getTextFromPage(reader, i); //System.out.println("STR2 = " + str2); //System.out.println("==========================="); row = null; //Concatener les pages : //str.append(str2); //System.out.println("STR = " + str); //Appel fonction split chaque ligne de la page. row = splitPage(str2); System.out.println(); System.out.println("\nnb row traiter = " + row.length); //recherche de correspondance AA, AF, ect.. //Recherche deux majuscules suivis d'espaces et nombre/espace/nombre //TODO //Gerer les cas o il n'y a pas de chiffre String pattern1 = "[A-Z]{2}"; String pattern = "[A-Z]{2}\\p{Space}+\\d+\\p{Space}?\\d+"; for (int j = 0; j < row.length; j++) { System.out.println("\nLigne traiter AVANT FCT: " + row[j]); //TAB_BILAN par ligne tabRubriqueBilan = recherchebilan(row[j], pattern); for (int k = 0; k < tabRubriqueBilan.size(); k++) { tabRubriqueBilanTotal.add(tabRubriqueBilan.get(k)); } } for (int j = 0; j < tabRubriqueBilanTotal.size(); j++) { tabRubriqueBilanTotal.get(j).show(); } System.out.println("TAILLE TAB PAR PAGE = " + tabRubriqueBilanTotal.size()); } afficheResultat(tabRubriqueBilanTotal); } catch (Exception err) { err.printStackTrace(); } return String.format("%s", str); }
From source file:no.dusken.aranea.admin.control.issue.IssueIndexer.java
License:Apache License
/**
 * Builds the {@code IssuePage} for one page of an issue's PDF.
 *
 * @param pageNumber 1-based page number to extract
 * @param pdfReader  open reader on the issue's PDF
 * @param issue      issue the page belongs to
 * @return a new {@code IssuePage} holding the page number, its extracted text and the issue
 * @throws IOException if the page text cannot be extracted
 */
private IssuePage getIssuePage(int pageNumber, PdfReader pdfReader, Issue issue) throws IOException {
    return new IssuePage(pageNumber, PdfTextExtractor.getTextFromPage(pdfReader, pageNumber), issue);
}
From source file:org.karsha.document.PDFBookmark.java
License:Open Source License
public void parsePdf(PdfReader reader, String title, String preBokmarkTitle, int pageFrom, int pageTo) throws IOException { String pagetext = null;/*from w w w . jav a 2s . c o m*/ String text = null; String txtPreSec; if (PdfTextExtractor.getTextFromPage(reader, pageFrom).contains(title)) { text = PdfTextExtractor.getTextFromPage(reader, pageFrom); // System.out.println(title + " contains at " + text.indexOf(title)); txtPreSec = text.substring(0, text.indexOf(title)); if (!txtPreSec.isEmpty()) { addToPreSec(preBokmarkTitle, txtPreSec); } pagetext = text.substring(text.indexOf(title)); // System.out.println(pagetext); pageFrom++; } else { // System.out.println("Can't find String " + title); } try { while (pageFrom < pageTo) { pagetext = pagetext + PdfTextExtractor.getTextFromPage(reader, pageFrom); pageFrom++; } // System.out.println(pagetext); if (!pagetext.isEmpty()) { docsSeperated.put(title, pagetext); } } catch (Exception ex) { // log.error(ex.getMessage()); } }
From source file:Package1.MAIN.java
public void pdf_reader() { int value = 0; try {// ww w . j a va 2 s . c o m PdfReader reader = new PdfReader(dir); int p = reader.getNumberOfPages(); String page = ""; if (p <= 250) { for (int i = 1; i <= p; i++) { if (i == 1) page = page + "\t\n---------------\t\tPage: " + i + "\t\t---------------\n\n\n"; else page = page + "\t\n\n\n---------------\t\tPage: " + i + "\t\t---------------\n\n\n"; page = page + PdfTextExtractor.getTextFromPage(reader, i); value = (i * 100) / p; progressive_status(value); } // jProgressBar1.setValue(value); } else { for (int i = 1; i <= 200; i++) { if (i == 1) page = page + "\t\n---------------\t\tPage: " + i + "\t\t---------------\n\n\n"; else page = page + "\t\n\n\n---------------\t\tPage: " + i + "\t\t---------------\n\n\n"; page = page + PdfTextExtractor.getTextFromPage(reader, i); value = (i * 100) / p; progressive_status(value); } JOptionPane.showMessageDialog(null, "Only 200 Page converted. \n Download pro version for full access. ", "Warning!!", JOptionPane.PLAIN_MESSAGE); } jTextPane1.setText(page); jLabel1.setText("Total Pages: " + p); } catch (IOException ex) { Logger.getLogger(MAIN.class.getName()).log(Level.SEVERE, null, ex); } }