package gate.translate.google;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.Factory;
import gate.Gate;
import gate.Resource;
import gate.Utils;
import gate.alignment.Alignment;
import gate.compound.CompoundDocument;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.InvalidOffsetException;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import com.google.api.translate.Language;
import com.google.api.translate.Translate;
/**
* The PR uses google translation service to translate documents from one
* language to the other. It requires users to provide a compound document as
* input. It also asks users to specify the member document, the language of the
* member document and the target langauge that user wants to translate into.
* User can also provide a unit of translation if user wants to align the
* translation automatically.
*
* @author niraj
*/
@CreoleResource(name = "Google Translator PR", comment = "Runs a google translator over the source member document and produces the translated document. User can also specify if he/she wants to align unitOfTranslation in the source and the target documents.")
public class GoogleTranslatorPR extends gate.creole.AbstractLanguageAnalyser {
/**
*
*/
private static final long serialVersionUID = -8443994795704361590L;
/**
* Used internally - this is the document that will be used for holding the
* original document and the composite documents.
*/
private CompoundDocument compoundDoc;
/**
* Unit of translation.
*/
private String unitOfTranslation;
/**
* name of the alignment feature that should be used for storing
* unitOfTranslation alignment
*/
private String alignmentFeatureName;
/**
* Input annotation set name, incase unit of Translation is specified.
*/
private String inputASName;
/**
* Id of the source document that needs to be translated.
*/
private String sourceDocumentId;
/**
* Id of the target document that needs to be created as a result of
* translation.
*/
private String targetDocumentId;
/**
* Language of the source document.
*/
private Language sourceLanguage;
/**
* Language of the target document.
*/
private Language targetLanguage;
/**
* Site referrer that is needed by the google.
*/
private String siteReferrer;
/** Initialise this resource, and return it. */
public Resource init() throws ResourceInstantiationException {
if(siteReferrer == null) { throw new ResourceInstantiationException(
"Invalid value for siteReferrer:" + siteReferrer); }
Translate.setHttpReferrer(siteReferrer);
return this;
}
/* this method is called to reinitialize the resource */
public void reInit() throws ResourceInstantiationException {
// reinitialization code
init();
}
/**
* Should be called to execute this PR on a document.
*/
public void execute() throws ExecutionException {
// if no document provided
if(document == null) { throw new ExecutionException("Document is null!"); }
if(!(document instanceof CompoundDocument))
throw new ExecutionException(
"Document must be an instance of compound document!");
if(sourceDocumentId == null)
throw new ExecutionException("Source document id can not be null");
compoundDoc = (CompoundDocument)document;
// obtain source document
Document sourceDoc = compoundDoc.getDocument(sourceDocumentId);
if(sourceDoc == null) { throw new ExecutionException(
"Invalid sourceDocumentId:" + sourceDocumentId
+ " - no member document found with id:" + sourceDocumentId); }
if(unitOfTranslation == null || unitOfTranslation.trim().length() == 0) { throw new ExecutionException(
"unitOfTranslation cannot be null"); }
if(targetDocumentId == null)
throw new ExecutionException("Target document id can not be null");
// annotation set to use
AnnotationSet set =
inputASName == null || inputASName.trim().length() == 0 ? sourceDoc
.getAnnotations() : sourceDoc.getAnnotations(inputASName);
set = set.get(unitOfTranslation);
if(set == null || set.isEmpty()) { throw new ExecutionException(
"No annotations found of the type:" + unitOfTranslation); }
List<String> textsToTranslate = new ArrayList<String>();
List<Annotation> annotations = new ArrayList<Annotation>();
annotations = Utils.inDocumentOrder(set);
for(Annotation a : annotations) {
textsToTranslate.add(Utils.stringFor(sourceDoc, a));
}
// we will modify the following buffer with translated text
StringBuffer targetDocumentText =
new StringBuffer("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<Document>\n");
// maximum length to translate
int MAX_LENGTH = 200;
StringBuffer toTrans = new StringBuffer();
for(int i = 0; i < textsToTranslate.size(); i++) {
if(textsToTranslate.get(i).trim().length() == 0) continue;
toTrans.append(textsToTranslate.get(i) + "<br>");
if(toTrans.length() > MAX_LENGTH || i == textsToTranslate.size() - 1) {
try {
String result =
Translate.execute(toTrans.toString(), sourceLanguage,
targetLanguage);
String[] translatedTexts = result.split("(<br>)");
for(int j = 0; j < translatedTexts.length; j++) {
targetDocumentText.append("<" + unitOfTranslation + ">"
+ encodeXml(translatedTexts[j]) + "</" + unitOfTranslation
+ ">");
targetDocumentText.append("\n");
}
} catch(Exception e) {
throw new ExecutionException(e);
}
toTrans = new StringBuffer();
}
}
// first create a new Document
Document targetDoc;
try {
File newFile =
new File(System.getProperty("java.io.tmpdir"), targetDocumentId
+ Gate.genSym() + ".xml");
BufferedWriter bw =
new BufferedWriter(new OutputStreamWriter(new FileOutputStream(
newFile), "UTF-8"));
targetDocumentText.append("</Document>");
bw.write(targetDocumentText.toString());
bw.close();
targetDoc = Factory.newDocument(newFile.toURI().toURL(), "UTF-8");
targetDoc.setName(targetDocumentId);
} catch(ResourceInstantiationException e) {
throw new ExecutionException(e);
} catch(MalformedURLException e) {
throw new ExecutionException(e);
} catch(UnsupportedEncodingException e) {
throw new ExecutionException(e);
} catch(FileNotFoundException e) {
throw new ExecutionException(e);
} catch(IOException e) {
throw new ExecutionException(e);
}
compoundDoc.addDocument(targetDocumentId, targetDoc);
AnnotationSet targetSet =
targetDoc.getAnnotations("Original markups").get(unitOfTranslation);
List<Annotation> targetAnnots = Utils.inDocumentOrder(targetSet);
AnnotationSet outputAS =
inputASName == null || inputASName.trim().length() == 0 ? targetDoc
.getAnnotations() : targetDoc.getAnnotations(inputASName);
Alignment alignment =
compoundDoc.getAlignmentInformation(alignmentFeatureName);
String asName =
inputASName == null || inputASName.trim().length() == 0
? null
: inputASName;
for(int i = 0; i < annotations.size(); i++) {
Annotation srcAnnot = annotations.get(i);
Annotation tgtAnnot = targetAnnots.get(i);
try {
Integer id =
outputAS.add(tgtAnnot.getStartNode().getOffset(), tgtAnnot
.getEndNode().getOffset(), tgtAnnot.getType(), Factory
.newFeatureMap());
Annotation toAlign = outputAS.get(id);
alignment
.align(srcAnnot, asName, sourceDoc, toAlign, asName, targetDoc);
} catch(InvalidOffsetException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
public static String encodeXml(String str) {
str = str.replaceAll("&", "&");
str = str.replaceAll("<", "<");
str = str.replaceAll(">", "&rt;");
str = str.replaceAll("\"", """);
str = str.replaceAll("\'", "'");
return str;
}
/**
* Annotation set to use for obtaining segment annotations and the annotations
* to copy into the composite document.
*
* @return
*/
public String getInputASName() {
return inputASName;
}
@Optional
@RunTime
@CreoleParameter
public void setInputASName(String inputAS) {
this.inputASName = inputAS;
}
public String getUnitOfTranslation() {
return unitOfTranslation;
}
@RunTime
@CreoleParameter(defaultValue = "Sentence")
public void setUnitOfTranslation(String unitOfTranslation) {
this.unitOfTranslation = unitOfTranslation;
}
public String getAlignmentFeatureName() {
return this.alignmentFeatureName;
}
@RunTime
@CreoleParameter(defaultValue = "sentence-alignment")
public void setAlignmentFeatureName(String alignmentFeatureName) {
this.alignmentFeatureName = alignmentFeatureName;
}
public String getSourceDocumentId() {
return sourceDocumentId;
}
@RunTime
@CreoleParameter
public void setSourceDocumentId(String sourceDocumentId) {
this.sourceDocumentId = sourceDocumentId;
}
public String getTargetDocumentId() {
return targetDocumentId;
}
@RunTime
@CreoleParameter
public void setTargetDocumentId(String targetDocumentId) {
this.targetDocumentId = targetDocumentId;
}
public Language getSourceLanguage() {
return sourceLanguage;
}
@RunTime
@CreoleParameter
public void setSourceLanguage(Language sourceLanguage) {
this.sourceLanguage = sourceLanguage;
}
public Language getTargetLanguage() {
return targetLanguage;
}
@RunTime
@CreoleParameter
public void setTargetLanguage(Language targetLanguage) {
this.targetLanguage = targetLanguage;
}
public String getSiteReferrer() {
return siteReferrer;
}
@CreoleParameter
public void setSiteReferrer(String siteReferrer) {
this.siteReferrer = siteReferrer;
}
} // class Google Translator PR
|