List of usage examples for edu.stanford.nlp.util ArrayCoreMap get
@Override @SuppressWarnings("unchecked") public <VALUE> VALUE get(Class<? extends Key<VALUE>> key)
From source file:eu.fbk.dkm.sectionextractor.pantheon.WikipediaGoodTextExtractor.java
License: Apache License
/**
 * Processes one Wikipedia page: filters it against the optional page/category
 * whitelists, strips category and "new:" lines from the parsed wiki text,
 * optionally tokenizes it with the Stanford CoreNLP pipeline, and writes the
 * result to disk as either a NAF document or a plain-text file, bucketed into
 * numbered subfolders (and per-category folders when a category is known).
 *
 * @param text   raw wiki markup of the page
 * @param title  page title (underscores normalized to spaces in the output)
 * @param wikiID numeric Wikipedia page ID, also used to bucket output folders
 */
@Override
public void contentPage(String text, String title, int wikiID) {
    if (pagesToConsider != null && !pagesToConsider.contains(title)) {
        return;
    }
    if (idCategory != null && !idCategory.containsKey(wikiID)) {
        return;
    }
    try {
        String okTitle = title.trim().replace('_', ' ');

        // Bucket output so no directory accumulates more than
        // MAX_FILES_PER_FOLDER files.
        String folderNumberName = Integer.toString(wikiID / MAX_FILES_PER_FOLDER);
        String fileName = Integer.toString(wikiID);

        String folderName = baseFolder.getAbsolutePath() + File.separator + folderNumberName;
        // BUG FIX: the early-return above only triggers when idCategory is
        // non-null, so a null idCategory used to reach this dereference and
        // throw an NPE that was silently swallowed by the outer catch.
        if (idCategory != null && idCategory.get(wikiID) != null) {
            folderName = baseFolder.getAbsolutePath() + File.separator + idCategory.get(wikiID)
                    + File.separator + folderNumberName;
        }

        File folder = new File(folderName);
        if (!folder.exists()) {
            folder.mkdirs();
        }

        fileName += NAFformat ? ".naf" : ".txt";
        String defFileName = folder.getAbsolutePath() + File.separator + fileName;
        File file = new File(defFileName);

        WikipediaText wikipediaText = new WikipediaText();
        String rawText = wikipediaText.parse(text, null);

        // Rebuild the text: title header, then every line except category
        // declarations and "new:" markers.
        StringBuilder buffer = new StringBuilder();
        buffer.append(okTitle).append("\n").append("\n");
        List<String> strings = Splitter.on('\n').trimResults()./*omitEmptyStrings().*/splitToList(rawText);
        for (String line : strings) {
            if (line.startsWith(categoryPrefix)) {
                continue;
            }
            if (line.startsWith("new:")) {
                continue;
            }
            buffer.append(line).append("\n");
        }
        rawText = buffer.toString();

        if (useStanford) {
            // Tokenize with CoreNLP: one token per line, sentences terminated
            // by the configured end-of-sentence tag.
            Annotation myDoc = new Annotation(rawText);
            pipeline.annotate(myDoc);
            StringBuilder tokenizedString = new StringBuilder();
            List<CoreMap> sents = myDoc.get(CoreAnnotations.SentencesAnnotation.class);
            for (CoreMap thisSent : sents) {
                ArrayCoreMap sentenceCoreMap = (ArrayCoreMap) thisSent;
                List<CoreLabel> tokens = sentenceCoreMap.get(CoreAnnotations.TokensAnnotation.class);
                for (CoreLabel token : tokens) {
                    tokenizedString.append(codeToParenthesis(token.toString())).append("\n");
                }
                tokenizedString.append(eosTag).append("\n");
            }
            rawText = tokenizedString.toString();
        }

        if (NAFformat) {
            final KAFDocument document = new KAFDocument("en", "v3");
            document.setRawText(rawText);
            document.createPublic();
            document.getPublic().uri = String.format("https://%s.wikipedia.org/wiki/%s",
                    getLocale().getLanguage(), title);
            document.createFileDesc();
            document.getFileDesc().author = "MediaWiki";
            document.getFileDesc().filename = fileName;
            document.getFileDesc().filetype = "Wikipedia article";
            document.getFileDesc().title = okTitle;
            document.save(file.getAbsolutePath());
        } else {
            logger.debug("Writing file " + file.getAbsolutePath());
            // try-with-resources: the original leaked the writer if write() threw.
            // NOTE(review): FileWriter uses the platform default charset — presumably
            // intentional for this corpus; confirm before switching to UTF-8.
            try (BufferedWriter writer = new BufferedWriter(new FileWriter(file))) {
                writer.write(rawText);
            } catch (Exception e) {
                // Keep the stack trace; the original logged only e.getMessage().
                logger.error("Error writing file " + file.getAbsolutePath(), e);
            }
        }
    } catch (Exception e) {
        // Preserve the cause instead of dropping it entirely.
        logger.error("Error processing page " + title + " (" + wikiID + ")", e);
    }
}