Révision 927
| tmp/TXMBetaPlugins.site/site.xml (revision 927) | ||
|---|---|---|
| 1 | 1 |
<?xml version="1.0" encoding="UTF-8"?> |
| 2 | 2 |
<site> |
| 3 |
<feature url="features/org.txm.ahc.feature_1.0.0.qualifier.jar" id="org.txm.ahc.feature" version="1.0.0.qualifier"/>
|
|
| 4 |
<feature url="features/org.txm.analec.feature_1.0.0.qualifier.jar" id="org.txm.analec.feature" version="1.0.0.qualifier"/>
|
|
| 5 |
<feature url="features/org.txm.annotation.feature_1.0.0.qualifier.jar" id="org.txm.annotation.feature" version="1.0.0.qualifier"/>
|
|
| 6 |
<feature url="features/org.txm.annotation.kr.feature_1.0.0.qualifier.jar" id="org.txm.annotation.kr.feature" version="1.0.0.qualifier"/>
|
|
| 7 |
<feature url="features/org.txm.backtomedia.feature_1.0.0.qualifier.jar" id="org.txm.backtomedia.feature" version="1.0.0.qualifier"/>
|
|
| 8 |
<feature url="features/org.txm.ca.feature_1.0.0.qualifier.jar" id="org.txm.ca.feature" version="1.0.0.qualifier"/>
|
|
| 9 |
<feature url="features/org.txm.chartsengine.jfreechart.feature_1.0.0.qualifier.jar" id="org.txm.chartsengine.jfreechart.feature" version="1.0.0.qualifier"/>
|
|
| 10 |
<feature url="features/org.txm.chartsengine.r.feature_1.0.0.qualifier.jar" id="org.txm.chartsengine.r.feature" version="1.0.0.qualifier"/>
|
|
| 11 |
<feature url="features/org.txm.concordance.feature_1.0.0.qualifier.jar" id="org.txm.concordance.feature" version="1.0.0.qualifier"/>
|
|
| 12 |
<feature url="features/org.txm.cooccurrence.feature_1.0.0.qualifier.jar" id="org.txm.cooccurrence.feature" version="1.0.0.qualifier"/>
|
|
| 13 |
<feature url="features/org.txm.cql2lsa.feature_1.0.0.qualifier.jar" id="org.txm.cql2lsa.feature" version="1.0.0.qualifier"/>
|
|
| 14 |
<feature url="features/org.txm.dictionary.feature_1.0.0.qualifier.jar" id="org.txm.dictionary.feature" version="1.0.0.qualifier"/>
|
|
| 15 |
<feature url="features/org.txm.edition.feature_1.0.0.qualifier.jar" id="org.txm.edition.feature" version="1.0.0.qualifier"/>
|
|
| 16 |
<feature url="features/org.txm.groovy.feature_1.0.0.qualifier.jar" id="org.txm.groovy.feature" version="1.0.0.qualifier"/>
|
|
| 17 |
<feature url="features/org.txm.index.feature_1.0.0.qualifier.jar" id="org.txm.index.feature" version="1.0.0.qualifier"/>
|
|
| 18 |
<feature url="features/org.txm.internalview.feature_1.0.0.qualifier.jar" id="org.txm.internalview.feature" version="1.0.0.qualifier"/>
|
|
| 19 |
<feature url="features/org.txm.lexicaltable.feature_1.0.0.qualifier.jar" id="org.txm.lexicaltable.feature" version="1.0.0.qualifier"/>
|
|
| 20 |
<feature url="features/org.txm.partition.feature_1.0.0.qualifier.jar" id="org.txm.partition.feature" version="1.0.0.qualifier"/>
|
|
| 21 |
<feature url="features/org.txm.progression.feature_1.0.0.qualifier.jar" id="org.txm.progression.feature" version="1.0.0.qualifier"/>
|
|
| 22 |
<feature url="features/org.txm.properties.feature_1.0.0.qualifier.jar" id="org.txm.properties.feature" version="1.0.0.qualifier"/>
|
|
| 23 |
<feature url="features/org.txm.rcp.feature_1.0.0.qualifier.jar" id="org.txm.rcp.feature" version="1.0.0.qualifier"/>
|
|
| 24 |
<feature url="features/org.txm.referencer.feature_1.0.0.qualifier.jar" id="org.txm.referencer.feature" version="1.0.0.qualifier"/>
|
|
| 25 |
<feature url="features/org.txm.searchengine.cqp.feature_1.0.0.qualifier.jar" id="org.txm.searchengine.cqp.feature" version="1.0.0.qualifier"/>
|
|
| 26 |
<feature url="features/org.txm.specificities.feature_1.0.0.qualifier.jar" id="org.txm.specificities.feature" version="1.0.0.qualifier"/>
|
|
| 27 |
<feature url="features/org.txm.statsengine.r.feature_1.0.0.qualifier.jar" id="org.txm.statsengine.r.feature" version="1.0.0.qualifier"/>
|
|
| 28 |
<feature url="features/org.txm.svn.feature_1.0.0.qualifier.jar" id="org.txm.svn.feature" version="1.0.0.qualifier"/>
|
|
| 29 |
<feature url="features/org.txm.textbalance.feature_1.0.0.qualifier.jar" id="org.txm.textbalance.feature" version="1.0.0.qualifier"/>
|
|
| 30 |
<feature url="features/org.txm.tigersearch.feature_1.0.0.qualifier.jar" id="org.txm.tigersearch.feature" version="1.0.0.qualifier"/>
|
|
| 31 |
<feature url="features/org.txm.treetagger.feature_1.0.0.qualifier.jar" id="org.txm.treetagger.feature" version="1.0.0.qualifier"/>
|
|
| 32 |
<feature url="features/org.txm.treetagger.files.feature_1.0.0.qualifier.jar" id="org.txm.treetagger.files.feature" version="1.0.0.qualifier"/>
|
|
| 33 |
<feature url="features/org.txm.wordcloud.feature_1.0.0.qualifier.jar" id="org.txm.wordcloud.feature" version="1.0.0.qualifier"/>
|
|
| 3 |
<feature url="features/org.txm.ahc.feature_1.0.0.201806251537.jar" id="org.txm.ahc.feature" version="1.0.0.201806251537"/>
|
|
| 4 |
<feature url="features/org.txm.analec.feature_1.0.0.201806251537.jar" id="org.txm.analec.feature" version="1.0.0.201806251537"/>
|
|
| 5 |
<feature url="features/org.txm.annotation.feature_1.0.0.201806251537.jar" id="org.txm.annotation.feature" version="1.0.0.201806251537"/>
|
|
| 6 |
<feature url="features/org.txm.annotation.kr.feature_1.0.0.201806251537.jar" id="org.txm.annotation.kr.feature" version="1.0.0.201806251537"/>
|
|
| 7 |
<feature url="features/org.txm.backtomedia.feature_1.0.0.201806251537.jar" id="org.txm.backtomedia.feature" version="1.0.0.201806251537"/>
|
|
| 8 |
<feature url="features/org.txm.ca.feature_1.0.0.201806251537.jar" id="org.txm.ca.feature" version="1.0.0.201806251537"/>
|
|
| 9 |
<feature url="features/org.txm.chartsengine.jfreechart.feature_1.0.0.201806251537.jar" id="org.txm.chartsengine.jfreechart.feature" version="1.0.0.201806251537"/>
|
|
| 10 |
<feature url="features/org.txm.chartsengine.r.feature_1.0.0.201806251537.jar" id="org.txm.chartsengine.r.feature" version="1.0.0.201806251537"/>
|
|
| 11 |
<feature url="features/org.txm.concordance.feature_1.0.0.201806251537.jar" id="org.txm.concordance.feature" version="1.0.0.201806251537"/>
|
|
| 12 |
<feature url="features/org.txm.cooccurrence.feature_1.0.0.201806251537.jar" id="org.txm.cooccurrence.feature" version="1.0.0.201806251537"/>
|
|
| 13 |
<feature url="features/org.txm.cql2lsa.feature_1.0.0.201806251537.jar" id="org.txm.cql2lsa.feature" version="1.0.0.201806251537"/>
|
|
| 14 |
<feature url="features/org.txm.dictionary.feature_1.0.0.201806251537.jar" id="org.txm.dictionary.feature" version="1.0.0.201806251537"/>
|
|
| 15 |
<feature url="features/org.txm.edition.feature_1.0.0.201806251537.jar" id="org.txm.edition.feature" version="1.0.0.201806251537"/>
|
|
| 16 |
<feature url="features/org.txm.groovy.feature_1.0.0.201806251537.jar" id="org.txm.groovy.feature" version="1.0.0.201806251537"/>
|
|
| 17 |
<feature url="features/org.txm.index.feature_1.0.0.201806251537.jar" id="org.txm.index.feature" version="1.0.0.201806251537"/>
|
|
| 18 |
<feature url="features/org.txm.internalview.feature_1.0.0.201806251537.jar" id="org.txm.internalview.feature" version="1.0.0.201806251537"/>
|
|
| 19 |
<feature url="features/org.txm.lexicaltable.feature_1.0.0.201806251537.jar" id="org.txm.lexicaltable.feature" version="1.0.0.201806251537"/>
|
|
| 20 |
<feature url="features/org.txm.partition.feature_1.0.0.201806251537.jar" id="org.txm.partition.feature" version="1.0.0.201806251537"/>
|
|
| 21 |
<feature url="features/org.txm.progression.feature_1.0.0.201806251537.jar" id="org.txm.progression.feature" version="1.0.0.201806251537"/>
|
|
| 22 |
<feature url="features/org.txm.properties.feature_1.0.0.201806251537.jar" id="org.txm.properties.feature" version="1.0.0.201806251537"/>
|
|
| 23 |
<feature url="features/org.txm.rcp.feature_1.0.0.201806251537.jar" id="org.txm.rcp.feature" version="1.0.0.201806251537"/>
|
|
| 24 |
<feature url="features/org.txm.referencer.feature_1.0.0.201806251537.jar" id="org.txm.referencer.feature" version="1.0.0.201806251537"/>
|
|
| 25 |
<feature url="features/org.txm.searchengine.cqp.feature_1.0.0.201806251537.jar" id="org.txm.searchengine.cqp.feature" version="1.0.0.201806251537"/>
|
|
| 26 |
<feature url="features/org.txm.specificities.feature_1.0.0.201806251537.jar" id="org.txm.specificities.feature" version="1.0.0.201806251537"/>
|
|
| 27 |
<feature url="features/org.txm.statsengine.r.feature_1.0.0.201806251537.jar" id="org.txm.statsengine.r.feature" version="1.0.0.201806251537"/>
|
|
| 28 |
<feature url="features/org.txm.svn.feature_1.0.0.201806251537.jar" id="org.txm.svn.feature" version="1.0.0.201806251537"/>
|
|
| 29 |
<feature url="features/org.txm.textbalance.feature_1.0.0.201806251537.jar" id="org.txm.textbalance.feature" version="1.0.0.201806251537"/>
|
|
| 30 |
<feature url="features/org.txm.tigersearch.feature_1.0.0.201806251537.jar" id="org.txm.tigersearch.feature" version="1.0.0.201806251537"/>
|
|
| 31 |
<feature url="features/org.txm.treetagger.feature_1.0.0.201806251537.jar" id="org.txm.treetagger.feature" version="1.0.0.201806251537"/>
|
|
| 32 |
<feature url="features/org.txm.treetagger.files.feature_1.0.0.201806251537.jar" id="org.txm.treetagger.files.feature" version="1.0.0.201806251537"/>
|
|
| 33 |
<feature url="features/org.txm.wordcloud.feature_1.0.0.201806251537.jar" id="org.txm.wordcloud.feature" version="1.0.0.201806251537"/>
|
|
| 34 | 34 |
</site> |
| tmp/org.txm.treetagger.core/plugin.xml (revision 927) | ||
|---|---|---|
| 7 | 7 |
class="org.txm.treetagger.core.preferences.TreeTaggerPreferences"> |
| 8 | 8 |
</initializer> |
| 9 | 9 |
</extension> |
| 10 |
<extension |
|
| 11 |
point="org.txm.annotation.core.AnnotationEngine"> |
|
| 12 |
<AnnotationEngine |
|
| 13 |
class="org.txm.treetagger.core.TreeTaggerEngine" |
|
| 14 |
description="Call TreeTagger treetagger train processus to annotate corpus"> |
|
| 15 |
</AnnotationEngine> |
|
| 16 |
</extension> |
|
| 10 | 17 |
|
| 11 | 18 |
</plugin> |
| tmp/org.txm.treetagger.core/.settings/org.eclipse.jdt.core.prefs (revision 927) | ||
|---|---|---|
| 1 | 1 |
eclipse.preferences.version=1 |
| 2 | 2 |
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled |
| 3 |
org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate |
|
| 3 | 4 |
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 |
| 5 |
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve |
|
| 4 | 6 |
org.eclipse.jdt.core.compiler.compliance=1.6 |
| 7 |
org.eclipse.jdt.core.compiler.debug.lineNumber=generate |
|
| 8 |
org.eclipse.jdt.core.compiler.debug.localVariable=generate |
|
| 9 |
org.eclipse.jdt.core.compiler.debug.sourceFile=generate |
|
| 5 | 10 |
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error |
| 6 | 11 |
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error |
| 7 | 12 |
org.eclipse.jdt.core.compiler.source=1.6 |
| tmp/org.txm.treetagger.core/.settings/org.eclipse.jdt.groovy.core.prefs (revision 927) | ||
|---|---|---|
| 1 |
eclipse.preferences.version=1 |
|
| 2 |
groovy.compiler.level=25 |
|
| 3 |
groovy.script.filters=**/*.dsld,y,**/*.gradle,n |
|
| 0 | 4 | |
| tmp/org.txm.treetagger.core/.classpath (revision 927) | ||
|---|---|---|
| 7 | 7 |
</accessrules> |
| 8 | 8 |
</classpathentry> |
| 9 | 9 |
<classpathentry kind="src" path="src"/> |
| 10 |
<classpathentry exported="true" kind="con" path="GROOVY_SUPPORT"/> |
|
| 11 |
<classpathentry exported="true" kind="con" path="GROOVY_DSL_SUPPORT"/> |
|
| 10 | 12 |
<classpathentry kind="output" path="bin"/> |
| 11 | 13 |
</classpath> |
| tmp/org.txm.treetagger.core/META-INF/MANIFEST.MF (revision 927) | ||
|---|---|---|
| 1 | 1 |
Manifest-Version: 1.0 |
| 2 |
Require-Bundle: org.txm.core;bundle-version="0.8.0";visibility:=reexport |
|
| 3 |
Export-Package: org.txm.treetagger.core.preferences |
|
| 2 |
Require-Bundle: org.txm.annotation.core;visibility:=reexport |
|
| 3 |
Export-Package: org.txm.treetagger.core, |
|
| 4 |
org.txm.treetagger.core.preferences |
|
| 4 | 5 |
Bundle-ActivationPolicy: lazy |
| 5 | 6 |
Bundle-Version: 1.0.0.qualifier |
| 6 | 7 |
Bundle-Name: TreeTagger Core |
| tmp/org.txm.treetagger.core/.project (revision 927) | ||
|---|---|---|
| 22 | 22 |
</buildCommand> |
| 23 | 23 |
</buildSpec> |
| 24 | 24 |
<natures> |
| 25 |
<nature>org.eclipse.jdt.groovy.core.groovyNature</nature> |
|
| 25 | 26 |
<nature>org.eclipse.pde.PluginNature</nature> |
| 26 | 27 |
<nature>org.eclipse.jdt.core.javanature</nature> |
| 27 | 28 |
</natures> |
| tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/AnnotateWTC.groovy (revision 927) | ||
|---|---|---|
| 1 |
// Copyright © 2010-2013 ENS de Lyon. |
|
| 2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
|
| 3 |
// Lyon 2, University of Franche-Comté, University of Nice |
|
| 4 |
// Sophia Antipolis, University of Paris 3. |
|
| 5 |
// |
|
| 6 |
// The TXM platform is free software: you can redistribute it |
|
| 7 |
// and/or modify it under the terms of the GNU General Public |
|
| 8 |
// License as published by the Free Software Foundation, |
|
| 9 |
// either version 2 of the License, or (at your option) any |
|
| 10 |
// later version. |
|
| 11 |
// |
|
| 12 |
// The TXM platform is distributed in the hope that it will be |
|
| 13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
|
| 14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
|
| 15 |
// PURPOSE. See the GNU General Public License for more |
|
| 16 |
// details. |
|
| 17 |
// |
|
| 18 |
// You should have received a copy of the GNU General |
|
| 19 |
// Public License along with the TXM platform. If not, see |
|
| 20 |
// http://www.gnu.org/licenses. |
|
| 21 |
// |
|
| 22 |
// |
|
| 23 |
// |
|
| 24 |
// $LastChangedDate: 2013-12-06 16:57:35 +0100 (Fri, 06 Dec 2013) $ |
|
| 25 |
// $LastChangedRevision: 2583 $ |
|
| 26 |
// $LastChangedBy: mdecorde $ |
|
| 27 |
// |
|
| 28 |
package org.txm.importer.xmltxm |
|
| 29 |
|
|
| 30 |
import java.io.File |
|
| 31 |
import java.text.DateFormat |
|
| 32 |
import java.util.Date |
|
| 33 |
|
|
| 34 |
import org.txm.Toolbox |
|
| 35 |
import org.txm.importer.* |
|
| 36 |
import org.txm.importer.cwb.* |
|
| 37 |
import org.txm.objects.* |
|
| 38 |
import org.txm.stat.utils.ConsoleProgressBar |
|
| 39 |
import org.txm.utils.DeleteDir; |
|
| 40 |
import org.txm.utils.LangDetector; |
|
| 41 |
import org.txm.utils.treetagger.TreeTagger |
|
| 42 |
|
|
| 43 |
// TODO: Auto-generated Javadoc |
|
| 44 |
/**
 * Applies TreeTagger directly to every file of a source directory.
 *
 * For each file found in the given source directory, the tagger output is
 * written under the same file name in the "annotations" sub-directory of the
 * given binary directory. Files whose output already exists, is at least as
 * recent as the input and is bigger than the input are not tagged again.
 */
class AnnotateCQP {

	/** When true, configuration details are printed to the console. */
	boolean debug = true;

	/**
	 * Turns the debug output on.
	 */
	public void setDebug() { debug = true; }

	String id;

	/**
	 * Runs TreeTagger on one file.
	 *
	 * The tagger options are read from the Toolbox preferences and split on
	 * single spaces; the binaries are looked up in the "bin" directory of the
	 * TreeTagger install path preference.
	 *
	 * @param ttsrcfile the file to tag
	 * @param ttoutfile the file that receives the TreeTagger output
	 * @param modelfile the TreeTagger model file
	 * @return true if TreeTagger ran without throwing, false otherwise
	 */
	public boolean applyTT(File ttsrcfile, File ttoutfile, File modelfile)
	{
		try {
			// TODO: why org.txm.utils.treetagger.TreeTagger tt = ..., throw IllegalAccessError ???
			String opt = Toolbox.getPreference(Toolbox.TREETAGGER_OPTIONS);
			if (opt == null) opt = "";
			String[] options = opt.split(" ");

			def tt = new TreeTagger(Toolbox.getPreference(Toolbox.TREETAGGER_INSTALL_PATH)+"/bin/", options);
			tt.settoken();
			tt.setlemma();
			tt.setquiet();
			tt.setsgml();
			tt.setnounknown();
			tt.seteostag("<s>");
			tt.treetagger(modelfile.getAbsolutePath(), ttsrcfile.getAbsolutePath(), ttoutfile.getAbsolutePath())
			// the source file is deliberately kept (no ttsrcfile.delete())
		}
		catch (Exception e) {
			e.printStackTrace();
			return false;
		}
		return true;
	}

	/**
	 * Tags every file of txmDir and stores the results in binDir/annotations.
	 *
	 * @param binDir the directory in which the "annotations" directory is created
	 * @param txmDir the directory containing the files to tag
	 * @param modelfilename the model file name, resolved against the TreeTagger models path preference
	 * @return true if every file was tagged (or skipped as up to date);
	 *         false if the TreeTagger binaries or the model file are missing,
	 *         if there is no file to process, or if tagging one file failed
	 */
	public boolean run(File binDir, File txmDir, String modelfilename)
	{
		if (debug) {
			println "rootDirFile "+binDir
			println "txmDir "+txmDir
			println "TREETAGGER INSTALL PATH : "+Toolbox.getPreference(Toolbox.TREETAGGER_INSTALL_PATH)
			println "TREETAGGER MODELS PATH : "+Toolbox.getPreference(Toolbox.TREETAGGER_MODELS_PATH)
		}

		// test if the Toolbox knows where the TreeTagger binaries are
		if (!new File(Toolbox.getPreference(Toolbox.TREETAGGER_INSTALL_PATH)+"/bin/").exists()) {
			println("Could not find TreeTagger binaries in "+Toolbox.getPreference(Toolbox.TREETAGGER_INSTALL_PATH)+"/bin/")
			return false;
		}

		// output directory (existing content is kept so up-to-date files can be skipped)
		File annotDir = new File(binDir, "annotations")
		annotDir.mkdir();

		// files to tag; listFiles() returns null when txmDir is not a readable directory
		List<File> files = txmDir.listFiles()

		// get the model file and check it
		File modelfile = new File(Toolbox.getPreference(Toolbox.TREETAGGER_MODELS_PATH), modelfilename);
		if (debug)
			println "model file : "+modelfile;

		if (!modelfile.exists()) {
			println "Skipping ANNOTATE: Incorrect modelfile path: "+modelfile;
			if (System.getProperty("os.name").startsWith("Windows"))
				println "Windows users: Windows might be hiding files extension. To see them, in the explorer parameters."
			return false;
		}

		// APPLY TREETAGGER
		// the count is computed null-safely: the message used to dereference
		// 'files' before the null check and crashed instead of returning false
		int fileCount = (files == null) ? 0 : files.size();
		println("Applying $modelfilename TreeTagger model on dir: "+txmDir+ " ("+fileCount+" files)")
		if (fileCount == 0)
			return false;

		ConsoleProgressBar cpb = new ConsoleProgressBar(fileCount);
		for (File f : files) {
			cpb.tick();

			File outfile = new File(annotDir, f.getName());
			boolean upToDate = outfile.exists() &&          // outfile exists
					outfile.lastModified() >= f.lastModified() && // outfile is more recent
					outfile.length() > f.length();                // outfile is bigger
			if (upToDate) continue;

			if (!applyTT(f, outfile, modelfile)) {
				System.out.println("Failed to apply treetagger on file "+f);
				return false;
			}
		}
		println("")

		//		if (DeleteDir.deleteDirectory(txmDir)) {
		//			if (annotDir.renameTo(txmDir)) {
		//
		//			} else {
		//				println "Could not rename 'annotations' directory to 'txm' directory"
		//				return false
		//			}
		//		} else {
		//			println "Could not delete txmDir: $txmDir (and could not copy 'annotations' directory"
		//		}

		return true;
	}
}
|
| tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/Annotate.groovy (revision 927) | ||
|---|---|---|
| 1 |
// Copyright © 2010-2013 ENS de Lyon. |
|
| 2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
|
| 3 |
// Lyon 2, University of Franche-Comté, University of Nice |
|
| 4 |
// Sophia Antipolis, University of Paris 3. |
|
| 5 |
// |
|
| 6 |
// The TXM platform is free software: you can redistribute it |
|
| 7 |
// and/or modify it under the terms of the GNU General Public |
|
| 8 |
// License as published by the Free Software Foundation, |
|
| 9 |
// either version 2 of the License, or (at your option) any |
|
| 10 |
// later version. |
|
| 11 |
// |
|
| 12 |
// The TXM platform is distributed in the hope that it will be |
|
| 13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
|
| 14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
|
| 15 |
// PURPOSE. See the GNU General Public License for more |
|
| 16 |
// details. |
|
| 17 |
// |
|
| 18 |
// You should have received a copy of the GNU General |
|
| 19 |
// Public License along with the TXM platform. If not, see |
|
| 20 |
// http://www.gnu.org/licenses. |
|
| 21 |
// |
|
| 22 |
// |
|
| 23 |
// |
|
| 24 |
// $LastChangedDate: 2016-10-03 15:30:36 +0200 (lun. 03 oct. 2016) $ |
|
| 25 |
// $LastChangedRevision: 3313 $ |
|
| 26 |
// $LastChangedBy: mdecorde $ |
|
| 27 |
// |
|
| 28 |
package org.txm.importer.xmltxm |
|
| 29 |
|
|
| 30 |
import java.io.File |
|
| 31 |
import java.text.DateFormat |
|
| 32 |
import java.util.Date |
|
| 33 |
import java.util.concurrent.* |
|
| 34 |
|
|
| 35 |
import org.txm.Toolbox |
|
| 36 |
import org.txm.importer.* |
|
| 37 |
import org.txm.importer.cwb.* |
|
| 38 |
import org.txm.objects.* |
|
| 39 |
import org.txm.stat.utils.ConsoleProgressBar; |
|
| 40 |
import org.txm.treetagger.core.preferences.TreeTaggerPreferences |
|
| 41 |
import org.txm.utils.LangDetector; |
|
| 42 |
import org.txm.utils.logger.Log; |
|
| 43 |
import org.txm.utils.treetagger.TreeTagger |
|
| 44 |
|
|
| 45 |
/** |
|
| 46 |
* Annotate and replace the TEI-TXM files of the folder $rootDirFile/txm with TreeTagger. |
|
| 47 |
* creates $rootDirFile/interp and $rootDirFile/treetagger |
|
| 48 |
* |
|
| 49 |
*/ |
|
| 50 |
class Annotate {
|
|
| 51 |
boolean cancelNow = false; |
|
| 52 |
|
|
| 53 |
/** The report file. */ |
|
| 54 |
File reportFile;//contains the txm:application tag content |
|
| 55 |
|
|
| 56 |
/** The resp person. */ |
|
| 57 |
String respPerson; |
|
| 58 |
|
|
| 59 |
/** The resp id. */ |
|
| 60 |
String respId; |
|
| 61 |
|
|
| 62 |
/** The resp desc. */ |
|
| 63 |
String respDesc; |
|
| 64 |
|
|
| 65 |
/** The resp date. */ |
|
| 66 |
String respDate; |
|
| 67 |
|
|
| 68 |
/** The resp when. */ |
|
| 69 |
String respWhen; |
|
| 70 |
|
|
| 71 |
/** The app ident. */ |
|
| 72 |
String appIdent; |
|
| 73 |
|
|
| 74 |
/** The app version. */ |
|
| 75 |
String appVersion; |
|
| 76 |
|
|
| 77 |
/** The distributor. */ |
|
| 78 |
String distributor; |
|
| 79 |
|
|
| 80 |
/** The publi stmt. */ |
|
| 81 |
String publiStmt; |
|
| 82 |
|
|
| 83 |
/** The source stmt. */ |
|
| 84 |
String sourceStmt; |
|
| 85 |
|
|
| 86 |
/** The types. */ |
|
| 87 |
def types; |
|
| 88 |
|
|
| 89 |
/** The types title. */ |
|
| 90 |
def typesTITLE; |
|
| 91 |
|
|
| 92 |
/** The types desc. */ |
|
| 93 |
def typesDesc; |
|
| 94 |
|
|
| 95 |
/** The types tagset. */ |
|
| 96 |
def typesTAGSET; |
|
| 97 |
|
|
| 98 |
/** The types web. */ |
|
| 99 |
def typesWEB; |
|
| 100 |
|
|
| 101 |
/** The idform. */ |
|
| 102 |
String idform; |
|
| 103 |
|
|
| 104 |
/** The debug. */ |
|
| 105 |
boolean debug = false; |
|
| 106 |
|
|
| 107 |
File modelsDirectory; |
|
| 108 |
|
|
| 109 |
/**
 * Creates an annotator whose models directory is initialised from the
 * MODELS_PATH value of the TreeTagger preferences.
 */
public Annotate() {
	// default models directory is read from the TreeTagger preferences node
	def modelsPath = TreeTaggerPreferences.getString(TreeTaggerPreferences.MODELS_PATH, TreeTaggerPreferences.PREFERENCES_NODE);
	modelsDirectory = new File(modelsPath);
}
|
| 112 |
|
|
| 113 |
/**
 * Switches the debug mode on (there is no way to switch it off again
 * through this method).
 */
public void setDebug() {
	this.debug = true;
}
|
| 119 |
|
|
| 120 |
String id; |
|
| 121 |
/**
 * Initialises the annotation metadata using the default word property names,
 * which are derived from the model file name.
 *
 * @param rootDirFile the directory in which the report file is located
 * @param modelfile the TreeTagger model file
 * @param modelfilename the model file name, used to build the identifier
 */
public void initTTOutfileInfos(File rootDirFile, File modelfile, String modelfilename)
{
	// no explicit property names: the 4-argument variant falls back to the model id
	String[] noProperties = null;
	initTTOutfileInfos(rootDirFile, modelfile, modelfilename, noProperties);
}
|
| 131 |
|
|
| 132 |
/**
 * Initialises the metadata (responsibility, application, annotation types)
 * later written by {@code writeStandoffFile}.
 *
 * @param rootDirFile the directory in which "NLPToolsParameters.xml" is located
 * @param modelfile the TreeTagger model file
 * @param modelfilename the model file name; the part before the first dot becomes the identifier ("??" is replaced by "xx")
 * @param properties 2-element array holding the names of the word properties to create. It can be null (names are then built from the identifier)
 */
public void initTTOutfileInfos(File rootDirFile, File modelfile, String modelfilename, String[] properties)
{
	// identifier = model file name up to its first dot
	id = modelfilename;
	String[] nameParts = id.split("\\.");
	if (nameParts.length > 0) {
		id = nameParts[0];
	}
	if ("??".equals(id)) {
		id = "xx"
	}

	reportFile = new File(rootDirFile, "NLPToolsParameters.xml");

	// responsibility statement
	respPerson = System.getProperty("user.name");
	respId = "txm";
	respDesc = "NLP annotation tool";
	DateFormat shortFormat = DateFormat.getDateInstance(DateFormat.SHORT, Locale.UK);
	respDate = shortFormat.format(new Date());
	DateFormat fullFormat = DateFormat.getDateInstance(DateFormat.FULL, Locale.UK);
	respWhen = fullFormat.format(new Date());

	// application statement
	appIdent = "TreeTagger";
	appVersion = "3.2";

	distributor = "";
	publiStmt = "";
	sourceStmt = "";

	// names of the two word properties: caller supplied, or built from the identifier
	boolean customNames = (properties != null && properties.length == 2);
	def posName = customNames ? properties[0] : id+"pos";
	def lemmaName = customNames ? properties[1] : id+"lemma";
	types = [posName, lemmaName];
	typesTITLE = [posName, lemmaName];

	//TODO: the tagset, website and description should be referenced in the model catalog
	if ("rgaqcj.par".equals(modelfile.getName())) {
		typesDesc = ["CATTEX pos tagset built with BFM texts", "fr lemma of the model "+modelfile+" - "]
		typesTAGSET = ["http://bfm.ens-lyon.fr/IMG/pdf/Cattex2009_Manuel.pdf", ""]
		typesWEB = ["http://bfm.ens-lyon.fr/", ""]
	} else {
		typesDesc = ["pos tagset built from model "+modelfile, id+" lemma of the model "+modelfile+" - "]
		typesTAGSET = ["", ""]
		typesWEB = ["", ""]
	}

	idform = "w";
}
|
| 182 |
|
|
| 183 |
/**
 * Runs TreeTagger on one file with the options stored in the preferences.
 *
 * @param ttsrcfile the TreeTagger source file
 * @param ttoutfile the file receiving the TreeTagger output
 * @param modelfile the TreeTagger model file
 * @return true, if successful
 */
public boolean applyTT(File ttsrcfile, File ttoutfile, File modelfile) {
	// null options make the 4-argument variant read them from the preferences
	String[] preferenceOptions = null;
	return applyTT(ttsrcfile, ttoutfile, modelfile, preferenceOptions);
}
|
| 194 |
|
|
| 195 |
/**
 * Runs TreeTagger on one file and deletes the source file afterwards.
 *
 * The binaries are looked up in the "bin" directory of the INSTALL_PATH
 * value of the TreeTagger preferences.
 *
 * @param ttsrcfile the TreeTagger source file; it is deleted once TreeTagger has run
 * @param ttoutfile the file receiving the TreeTagger output
 * @param modelfile the TreeTagger model file
 * @param options the TreeTagger options; if null, the OPTIONS value of the TreeTagger preferences is split on single spaces and used instead
 * @return true if TreeTagger ran without throwing, false otherwise
 */
public boolean applyTT(File ttsrcfile, File ttoutfile, File modelfile, String[] options)
{
	try {
		// TODO: why org.txm.utils.treetagger.TreeTagger tt = ..., throw IllegalAccessError ???
		if (options == null) {
			String opt = TreeTaggerPreferences.getString(TreeTaggerPreferences.OPTIONS, TreeTaggerPreferences.PREFERENCES_NODE);
			if (opt == null) opt = "";
			options = opt.split(" ");
		}

		def tt = new TreeTagger(TreeTaggerPreferences.getString(TreeTaggerPreferences.INSTALL_PATH, TreeTaggerPreferences.PREFERENCES_NODE)+"/bin/", options);
		tt.settoken();
		tt.setlemma();
		tt.setquiet();
		tt.setsgml();
		tt.setnounknown();
		tt.seteostag("<s>");
		tt.debug(debug);
		tt.treetagger(modelfile.getAbsolutePath(), ttsrcfile.getAbsolutePath(), ttoutfile.getAbsolutePath())
		ttsrcfile.delete();
	} catch (Exception e) {
		Log.printStackTrace(e);
		// the message used to interpolate an undefined variable 'f', which made
		// this handler throw and hide the original error
		System.out.println("Failed to apply TreeTagger on $ttsrcfile input file with the $modelfile model file.");
		return false;
	}
	return true;
}
|
| 234 |
|
|
| 235 |
/**
 * Converts a TreeTagger output file into a stand-off annotation file, using
 * the metadata fields of this object (annotation types, responsibility,
 * application and publication information).
 *
 * @param ttoutfile the TreeTagger output file to convert
 * @param posfile the stand-off file to write
 * @return the result of the CSV2W_ANA conversion
 */
public boolean writeStandoffFile(File ttoutfile, File posfile)
{
	def converter = new CSV2W_ANA();

	// describe the annotations and who/what produced them
	converter.setAnnotationTypes(types, typesDesc, typesTAGSET, typesWEB, idform);
	converter.setResp(respId, respDesc, respDate, respPerson, respWhen);
	converter.setApp(appIdent, appVersion);

	String targetPath = ttoutfile.getAbsolutePath();
	converter.setTarget(targetPath, reportFile);
	converter.setInfos(distributor, publiStmt, sourceStmt);

	def charset = "UTF-8";
	return converter.process(ttoutfile, posfile, charset);
}
|
| 254 |
|
|
| 255 |
/** |
|
| 256 |
* Run step by step : build TT src files, run TT, build xml-standoff files, inject standoff annotations |
|
| 257 |
* |
|
| 258 |
* @param rootDirFile the root dir file |
|
| 259 |
* @param modelfilename the modelfilename |
|
| 260 |
* @return true, if successful |
|
| 261 |
*/ |
|
| 262 |
public boolean run(File binDir, File txmDir, String modelfilename) |
|
| 263 |
{
|
|
| 264 |
//test if modelfile exists |
|
| 265 |
if (debug) {
|
|
| 266 |
println "rootDirFile "+binDir |
|
| 267 |
println "txmDir "+txmDir |
|
| 268 |
println "TREETAGGER INSTALL PATH : "+TreeTaggerPreferences.getString(TreeTaggerPreferences.INSTALL_PATH, TreeTaggerPreferences.PREFERENCES_NODE); |
|
| 269 |
println "TREETAGGER MODELS PATH : "+TreeTaggerPreferences.getString(TreeTaggerPreferences.MODELS_PATH, TreeTaggerPreferences.PREFERENCES_NODE) |
|
| 270 |
} |
|
| 271 |
|
|
| 272 |
//test if the Toolbox know TreeTagger |
|
| 273 |
if (!new File(TreeTaggerPreferences.getString(TreeTaggerPreferences.INSTALL_PATH, TreeTaggerPreferences.PREFERENCES_NODE)+"/bin/").exists()) {
|
|
| 274 |
println("Could not find TreeTagger binaries in "+TreeTaggerPreferences.getString(TreeTaggerPreferences.INSTALL_PATH, TreeTaggerPreferences.PREFERENCES_NODE)+"/bin/")
|
|
| 275 |
return false; |
|
| 276 |
} |
|
| 277 |
String langAll = null |
|
| 278 |
String lang; |
|
| 279 |
if (modelfilename.startsWith("??")) {
|
|
| 280 |
langAll = new LangDetector(binDir).getLang(); |
|
| 281 |
println "General lang $langAll" |
|
| 282 |
} |
|
| 283 |
|
|
| 284 |
//cleaning |
|
| 285 |
new File(binDir, "annotations").deleteDir(); |
|
| 286 |
new File(binDir, "annotations").mkdir(); |
|
| 287 |
new File(binDir, "treetagger").deleteDir(); |
|
| 288 |
new File(binDir, "treetagger").mkdir(); |
|
| 289 |
|
|
| 290 |
ArrayList<String> milestones = []; |
|
| 291 |
|
|
| 292 |
//BUILD TT FILE READY TO BE TAGGED |
|
| 293 |
List<File> files = txmDir.listFiles() |
|
| 294 |
|
|
| 295 |
println("Building TT source files ("+files.size()+") from directory "+txmDir)
|
|
| 296 |
ConsoleProgressBar cpb = new ConsoleProgressBar(files.size()) |
|
| 297 |
for (File f : files) {
|
|
| 298 |
cpb.tick() |
|
| 299 |
File srcfile = f; |
|
| 300 |
File resultfile = new File(binDir, "treetagger/"+f.getName()+".tt"); |
|
| 301 |
if(debug) |
|
| 302 |
println "build tt src : "+srcfile+" >> "+resultfile |
|
| 303 |
def ttsrcbuilder = new BuildTTSrc(srcfile.toURI().toURL()) |
|
| 304 |
if (!ttsrcbuilder.process(resultfile, null)) |
|
| 305 |
System.out.println("Failed to build tt src file of "+srcfile);
|
|
| 306 |
} |
|
| 307 |
|
|
| 308 |
if (cancelNow) return; |
|
| 309 |
|
|
| 310 |
File modelDirectory = new File(TreeTaggerPreferences.getString(TreeTaggerPreferences.MODELS_PATH, TreeTaggerPreferences.PREFERENCES_NODE)); |
|
| 311 |
if (!modelDirectory.exists()) {
|
|
| 312 |
println "Skipping ANNOTATE: TreeTagger language model file directory not found: "+modelDirectory.getAbsolutePath(); |
|
| 313 |
return false; |
|
| 314 |
} else if (!modelDirectory.canRead()) {
|
|
| 315 |
println "Skipping ANNOTATE: impossible to access the TreeTagger language model file directory: "+modelDirectory.getAbsolutePath(); |
|
| 316 |
return false; |
|
| 317 |
} |
|
| 318 |
println("")
|
|
| 319 |
//Convert encoding if needed |
|
| 320 |
|
|
| 321 |
//APPLY TREETAGGER |
|
| 322 |
files = new File(binDir, "treetagger").listFiles() |
|
| 323 |
println("Applying $modelfilename TreeTagger model on dir: "+new File(binDir, "treetagger")+ " ("+files.size()+" files)")
|
|
| 324 |
if (files == null || files.size() == 0) |
|
| 325 |
return false; |
|
| 326 |
File modelfile; |
|
| 327 |
cpb = new ConsoleProgressBar(files.size()) |
|
| 328 |
for (File f : files) {
|
|
| 329 |
String tmpModelFileName = modelfilename |
|
| 330 |
if (modelfilename.startsWith("??")) {
|
|
| 331 |
lang = langAll; |
|
| 332 |
if (f.length() > LangDetector.MINIMALSIZE) {
|
|
| 333 |
lang = new LangDetector(f).getLang(); |
|
| 334 |
//println "guessing lang $f : $lang" |
|
| 335 |
} |
|
| 336 |
tmpModelFileName = lang+".par" |
|
| 337 |
} |
|
| 338 |
modelfile = new File(modelsDirectory, tmpModelFileName); |
|
| 339 |
if (debug) |
|
| 340 |
println "model file : "+modelfile; |
|
| 341 |
|
|
| 342 |
File |
|
| 343 |
|
|
| 344 |
if (!modelfile.exists()) {
|
|
| 345 |
println "Skipping ANNOTATE: '$modelfile' TreeTagger language model file not found." |
|
| 346 |
if(System.getProperty("os.name").startsWith("Windows") || System.getProperty("os.name").startsWith("Mac"))
|
|
| 347 |
println "Windows&Mac users: the operating system might be hiding file extensions. Use your file explorer to check the file name." |
|
| 348 |
return false; |
|
| 349 |
} else if (!modelfile.canRead()) {
|
|
| 350 |
println "Skipping ANNOTATE: impossible to access the '$modelfile' TreeTagger language model file." |
|
| 351 |
return false; |
|
| 352 |
} |
|
| 353 |
|
|
| 354 |
// if (modelfile.getName().equals("sp.par")) {//UTF >> Latin1
|
|
| 355 |
// if(debug) |
|
| 356 |
// println "fix encoding for model "+modelfile |
|
| 357 |
// new EncodingConverter(f, "UTF-8", "ISO-8859-1") |
|
| 358 |
// } |
|
| 359 |
|
|
| 360 |
cpb.tick() |
|
| 361 |
File infile = f; |
|
| 362 |
File outfile = new File(f.getParent(),f.getName()+"-out.tt"); |
|
| 363 |
if (!applyTT(infile, outfile, modelfile)) {
|
|
| 364 |
return false; |
|
| 365 |
} |
|
| 366 |
|
|
| 367 |
// //Reconvert encoding if needed |
|
| 368 |
// if (modelfile.getName().equals("sp.par")) {
|
|
| 369 |
// if(debug) |
|
| 370 |
// println "convert "+f+" latin1 >> UTF-8" |
|
| 371 |
// new EncodingConverter(f, "ISO-8859-1", "UTF-8") |
|
| 372 |
// } |
|
| 373 |
} |
|
| 374 |
println("")
|
|
| 375 |
|
|
| 376 |
if (cancelNow) return; |
|
| 377 |
|
|
| 378 |
//BUILD STAND-OFF FILES |
|
| 379 |
if (modelfile == null) {
|
|
| 380 |
println "no model applied" |
|
| 381 |
return false; |
|
| 382 |
} |
|
| 383 |
|
|
| 384 |
initTTOutfileInfos(binDir, modelfile, modelfilename); |
|
| 385 |
files = new File(binDir, "treetagger").listFiles()// now contains the result files of TT |
|
| 386 |
println "Building stdoff files ("+files.size()+") from dir:"+new File(binDir, "treetagger")+" to "+new File(binDir, "annotations");
|
|
| 387 |
if (files == null || files.size() == 0) |
|
| 388 |
return false; |
|
| 389 |
cpb = new ConsoleProgressBar(files.size()) |
|
| 390 |
for (File ttrezfile : files) {
|
|
| 391 |
cpb.tick() |
|
| 392 |
File annotfile = new File(binDir, "annotations/"+ttrezfile.getName()+"-STOFF.xml"); |
|
| 393 |
if (!writeStandoffFile(ttrezfile, annotfile)) |
|
| 394 |
println("Failed to build standoff file of "+ttrezfile);
|
|
| 395 |
} |
|
| 396 |
println("")
|
|
| 397 |
|
|
| 398 |
if (cancelNow) return; |
|
| 399 |
|
|
| 400 |
//INJECT ANNOTATIONS |
|
| 401 |
List<File> interpfiles = new File(binDir, "annotations").listFiles(); |
|
| 402 |
List<File> txmfiles = txmDir.listFiles(); |
|
| 403 |
if (txmfiles == null) {
|
|
| 404 |
println "No file to annotate in "+txmDir.getAbsolutePath() |
|
| 405 |
return false; |
|
| 406 |
} |
|
| 407 |
interpfiles.sort(); // same order |
|
| 408 |
txmfiles.sort(); //same order |
|
| 409 |
println "Injecting stdoff files ("+interpfiles.size()+") data from "+new File(binDir, "annotations")+ " to xml-txm files of "+txmDir;
|
|
| 410 |
if (interpfiles == null || interpfiles.size() == 0) |
|
| 411 |
return false; |
|
| 412 |
cpb = new ConsoleProgressBar(interpfiles.size()) |
|
| 413 |
for (int i = 0 ; i < interpfiles.size() ; i++) {
|
|
| 414 |
cpb.tick() |
|
| 415 |
File srcfile = txmfiles.get(i); |
|
| 416 |
File pos1file = interpfiles.get(i); |
|
| 417 |
File temp = File.createTempFile("Annotate", "temp", srcfile.getParentFile());
|
|
| 418 |
def builder = new AnnotationInjection(srcfile.toURI().toURL(), pos1file.toURI().toURL()); |
|
| 419 |
if (!builder.process(temp)) {
|
|
| 420 |
return false; |
|
| 421 |
} |
|
| 422 |
builder = null; |
|
| 423 |
|
|
| 424 |
//println "renaming files..." |
|
| 425 |
if (!(srcfile.delete() && temp.renameTo(srcfile))) |
|
| 426 |
println "Warning can't rename file "+temp+" to "+srcfile |
|
| 427 |
} |
|
| 428 |
println("")
|
|
| 429 |
return true; |
|
| 430 |
} |
|
| 431 |
|
|
| 432 |
/**
 * Sets the directory from which the run methods resolve model files
 * (they build {@code new File(modelsDirectory, <name>)}).
 *
 * @param modelsDirectory the directory to store in the {@code modelsDirectory} field
 */
public void setModelsDirectory(File modelsDirectory) {
    this.modelsDirectory = modelsDirectory
}
|
| 435 |
|
|
| 436 |
/**
 * Runs the annotation file by file, which allows a different language per file.
 * Convenience overload: delegates to the six-argument {@code run} with
 * {@code replace = false} and empty {@code properties} and {@code options} arrays.
 *
 * @param binDir passed through to the delegated call
 * @param txmDir passed through to the delegated call
 * @param langs associates a file name with a model file name
 * @return the value returned by the delegated call
 */
public boolean run(File binDir, File txmDir, HashMap<String, String> langs) {
    String[] noProperties = new String[0]
    String[] noOptions = new String[0]
    return run(binDir, txmDir, langs, false, noProperties, noOptions)
}
|
| 448 |
|
|
| 449 |
/**
 * Runs the annotation file by file, which allows a different language per file.
 * The "annotations", "ptreetagger" and "treetagger" directories of binDir are
 * deleted and recreated, then one task per file of txmDir is submitted to a
 * fixed thread pool sized to about 70% of the available processors (at least 1).
 *
 * @param binDir directory receiving the working sub-directories
 * @param txmDir directory whose files are annotated
 * @param langs associates a file name with a model file name; files without an entry are reported and skipped
 * @param replace forwarded to the per-file run (replace or create a word property)
 * @param properties forwarded to the per-file run
 * @param options forwarded to the per-file run
 * @return false when txmDir cannot be listed, true otherwise (per-file failures are not reflected in the result)
 */
public boolean run(File binDir, File txmDir, HashMap<String, String> langs, boolean replace, String[] properties, String[] options)
{
    String treeTaggerBin = TreeTaggerPreferences.getString(TreeTaggerPreferences.INSTALL_PATH, TreeTaggerPreferences.PREFERENCES_NODE)+"/bin/"
    if (!new File(treeTaggerBin).exists()) {
        println("Path to TreeTagger is wrong "+treeTaggerBin)
        // NOTE(review): returns true although nothing was annotated; kept as in the
        // original because callers may rely on it - confirm whether false is intended.
        return true;
    }

    //cleaning
    File annotDir = new File(binDir,"annotations");
    annotDir.deleteDir();
    annotDir.mkdir();
    File ptreetaggerDir = new File(binDir,"ptreetagger");
    ptreetaggerDir.deleteDir();
    ptreetaggerDir.mkdir();
    File treetaggerDir = new File(binDir,"treetagger");
    treetaggerDir.deleteDir();
    treetaggerDir.mkdir();

    // File.listFiles() returns null when txmDir is not a directory or cannot be read
    File[] files = txmDir.listFiles()
    if (files == null) {
        println "No file to annotate in "+txmDir.getAbsolutePath()
        return false;
    }

    int cores = Runtime.getRuntime().availableProcessors()
    int coresToUse = Math.max(1.0, cores * 0.7)
    ExecutorService pool = Executors.newFixedThreadPool(coresToUse)

    ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
    // declared outside the loop so that each task gets a distinct name (TT_1, TT_2, ...)
    int counter = 1;
    for (File teiFile : files) {
        ThreadFile t = new ThreadFile("TT_"+counter++, teiFile) {

            public void run() {

                if (cancelNow) return;
                String lang = langs.get(f.getName());
                if (lang == null) {
                    println "Error: no lang defined for file $f"
                    return;
                }

                run(f, lang, binDir, txmDir, replace, properties, options, annotDir, ptreetaggerDir, treetaggerDir)

                cpb.tick();
            }
        };

        pool.execute(t)
    }

    pool.shutdown()
    // awaitTermination returns false when the timeout elapses before all tasks finished
    if (!pool.awaitTermination(10, TimeUnit.HOURS)) {
        println "Warning: TreeTagger annotation tasks did not finish within 10 hours"
    }
    println ""
    return true;
}
|
| 511 |
/**
 * Annotates a single file with default settings. Makes sure the
 * "annotations", "ptreetagger" and "treetagger" directories exist under binDir
 * (without cleaning them), then delegates to the ten-argument {@code run} with
 * {@code replace = false} and empty {@code properties} and {@code options} arrays.
 *
 * @param f the file to annotate
 * @param lang the language; the delegated call looks for the model file lang+".par"
 * @param binDir directory holding the working sub-directories
 * @param txmDir passed through to the delegated call
 * @return the value returned by the delegated call
 */
public boolean run(File f, String lang, File binDir, File txmDir) {
    File annotDir = new File(binDir,"annotations");
    File ptreetaggerDir = new File(binDir,"ptreetagger");
    File treetaggerDir = new File(binDir,"treetagger");

    // created in the same order as before: annotations, ptreetagger, treetagger
    for (File workDir : [annotDir, ptreetaggerDir, treetaggerDir]) {
        workDir.mkdir();
    }

    String[] noProperties = new String[0]
    String[] noOptions = new String[0]
    return run(f, lang, binDir, txmDir, false, noProperties, noOptions, annotDir, ptreetaggerDir, treetaggerDir)
}
|
| 522 |
|
|
| 523 |
|
|
| 524 |
/**
 * Annotates one file: builds the TreeTagger source file, applies TreeTagger,
 * writes the stand-off file and injects the annotations back into f.
 *
 * @param f the file to annotate; it is replaced by the injected result
 * @param lang the language; the model file used is lang+".par" in the models directory
 * @param binDir forwarded to initTTOutfileInfos
 * @param txmDir directory in which the temporary injected file is written
 * @param replace forwarded to AnnotationInjection
 * @param properties forwarded to initTTOutfileInfos
 * @param options forwarded to applyTT
 * @param annotDir directory receiving the "-STDOFF.xml" stand-off file
 * @param ptreetaggerDir directory receiving the "-src.tt" TreeTagger input file
 * @param treetaggerDir directory receiving the "-out.tt" TreeTagger output file
 * @return false when the model file is missing or a build/injection step fails, otherwise whether f exists afterwards
 */
public boolean run(File f, String lang, File binDir, File txmDir, boolean replace, String[] properties, String[] options, File annotDir, File ptreetaggerDir, File treetaggerDir) {

    File modelfile = new File(modelsDirectory, lang+".par");
    if (!modelfile.exists()) {
        println "Error: No Modelfile available for lang "+modelfile+". Continue import process ";
        return false;
    }
    File annotfile = new File(annotDir, f.getName()+"-STDOFF.xml");
    File ttsrcfile = new File(ptreetaggerDir, f.getName()+"-src.tt");
    File ttrezfile = new File(treetaggerDir, f.getName()+"-out.tt");

    //BUILD TT FILE READY TO BE TAGGED
    // toURI().toURL() escapes characters that are illegal in URLs; File.toURL() does not
    def builder = new BuildTTSrc(f.toURI().toURL());
    if (!builder.process(ttsrcfile, null)) {
        System.out.println("Failed to build tt src file of "+f);
        return false;
    }

    //Apply TT
    // NOTE(review): the result of the 4-argument applyTT is not checked because its
    // return type is not visible here - confirm and guard it like the 3-argument call.
    applyTT(ttsrcfile, ttrezfile, modelfile, options);

    //CREATE STANDOFF FILES
    initTTOutfileInfos(binDir, modelfile, lang, properties);
    if (!writeStandoffFile(ttrezfile, annotfile)) {
        println("Failed to build standoff file of "+ttrezfile);
    }

    //INJECT ANNOTATIONS
    File tmpFile = new File(txmDir, "temp_"+f.getName())
    builder = new AnnotationInjection(f.toURI().toURL(), annotfile.toURI().toURL(), replace);
    if (!builder.process(tmpFile)) {
        // keep the source file untouched when the injection failed
        tmpFile.delete();
        return false;
    }
    if (!(f.delete() && tmpFile.renameTo(f))) println "Warning can't rename file "+tmpFile+" to "+f

    return f.exists();
}
|
| 555 |
|
|
| 556 |
/**
 * Requests cancellation by raising the {@code cancelNow} flag, which the run
 * methods test between steps ({@code if (cancelNow) return}).
 */
public void setCancelNow() {
    this.cancelNow = true
}
|
| 559 |
|
|
| 560 |
/**
 * A named Thread that carries the file it works on. The batch run method
 * subclasses it anonymously and reads {@code f} from its {@code run()} body.
 */
public class ThreadFile extends Thread {

    /** The file this task processes. */
    File f;

    /**
     * @param name the thread name
     * @param f the file to process
     */
    public ThreadFile(String name, File f) {
        setName(name);
        this.f = f;
    }
}
|
| 567 |
} |
|
| tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/AnnotationInjectionFromTTOutput.groovy (revision 927) | ||
|---|---|---|
| 1 |
// Copyright © 2010-2013 ENS de Lyon. |
|
| 2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
|
| 3 |
// Lyon 2, University of Franche-Comté, University of Nice |
|
| 4 |
// Sophia Antipolis, University of Paris 3. |
|
| 5 |
// |
|
| 6 |
// The TXM platform is free software: you can redistribute it |
|
| 7 |
// and/or modify it under the terms of the GNU General Public |
|
| 8 |
// License as published by the Free Software Foundation, |
|
| 9 |
// either version 2 of the License, or (at your option) any |
|
| 10 |
// later version. |
|
| 11 |
// |
|
| 12 |
// The TXM platform is distributed in the hope that it will be |
|
| 13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
|
| 14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
|
| 15 |
// PURPOSE. See the GNU General Public License for more |
|
| 16 |
// details. |
|
| 17 |
// |
|
| 18 |
// You should have received a copy of the GNU General |
|
| 19 |
// Public License along with the TXM platform. If not, see |
|
| 20 |
// http://www.gnu.org/licenses. |
|
| 21 |
// |
|
| 22 |
// |
|
| 23 |
// |
|
| 24 |
// $LastChangedDate: 2016-03-02 14:17:06 +0100 (mer., 02 mars 2016) $ |
|
| 25 |
// $LastChangedRevision: 3134 $ |
|
| 26 |
// $LastChangedBy: mdecorde $ |
|
| 27 |
// |
|
| 28 |
package org.txm.importer.xmltxm |
|
| 29 |
import javax.xml.stream.* |
|
| 30 |
|
|
| 31 |
import org.txm.importer.StaxIdentityParser |
|
| 32 |
import org.txm.importer.filters.* |
|
| 33 |
import org.txm.importer.PersonalNamespaceContext |
|
| 34 |
|
|
| 35 |
/** |
|
| 36 |
* The Class AnnotationInjectionFromTTOutput. |
|
| 37 |
* |
|
| 38 |
* @author mdecorde |
|
| 39 |
* |
|
| 40 |
* inject annotation from a stand-off file into a xml-tei-txm file |
|
| 41 |
*/ |
|
| 42 |
|
|
| 43 |
public class AnnotationInjectionFromTTOutput extends StaxIdentityParser {
|
|
| 44 |
|
|
| 45 |
public static String TXMNS = "http://textometrie.org/1.0" |
|
| 46 |
|
|
| 47 |
/** The xml reader factory. */ |
|
| 48 |
private def factory; |
|
| 49 |
|
|
| 50 |
/** The links. */ |
|
| 51 |
private List<String> links; |
|
| 52 |
|
|
| 53 |
/** The linkparsers. key=type*/ |
|
| 54 |
private HashMap<String, XMLStreamReader> linkparsers; |
|
| 55 |
|
|
| 56 |
/** The anaurl. */ |
|
| 57 |
private def anaurl; |
|
| 58 |
|
|
| 59 |
/** The anainput data. */ |
|
| 60 |
private def anainputData; |
|
| 61 |
|
|
| 62 |
/** The anafactory. */ |
|
| 63 |
private XMLInputFactory anafactory = XMLInputFactory.newInstance(); |
|
| 64 |
|
|
| 65 |
/** The anaparser. */ |
|
| 66 |
private XMLStreamReader anaparser; |
|
| 67 |
private XMLStreamReader headerparser; |
|
| 68 |
|
|
| 69 |
/** The resp stmt id. */ |
|
| 70 |
String respStmtID = ""; |
|
| 71 |
|
|
| 72 |
/** The present taxonomies. */ |
|
| 73 |
ArrayList<String> presentTaxonomies = new ArrayList(); |
|
| 74 |
|
|
| 75 |
/**
 * Creates an injector for one xml-tei-txm file and its stand-off file, and
 * immediately builds one stand-off parser per link group.
 * XMLStreamException and IOException raised while doing so are only printed.
 *
 * @param url the xml-tei-txm file
 * @param anaurl the stand-off file
 */
public AnnotationInjectionFromTTOutput(URL url, URL anaurl) {
    super(url); // per the original comment: init reader and writer
    try {
        factory = XMLInputFactory.newInstance();
        this.anaurl = anaurl;
        // build a parser per linkgroup
        buildLinkParsers();
    } catch (XMLStreamException ex) {
        System.out.println(ex);
    } catch (IOException ex) {
        System.out.println("IOException while parsing ");
    }
}
|
| 93 |
|
|
| 94 |
/**
 * Re-reads the stand-off file from its beginning and copies part of it to the
 * output writer: start tags (with namespace declarations and attributes),
 * end tags, character data and comments.
 *
 * @param containertag local name of the element delimiting the copied part
 * @param captureTheTag when true the container element itself is copied too;
 *        when false only what lies between its start and end tags is copied
 */
private void getHeaderInfos(String containertag, boolean captureTheTag)
{
    anainputData = new BufferedInputStream(anaurl.openStream());
    XMLStreamReader reader = null;
    try {
        reader = anafactory.createXMLStreamReader(anainputData);
        headerparser = reader;
        boolean start = false;
        String localname;
        for (int event = headerparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = headerparser.next())
        {
            if (event == XMLStreamConstants.START_ELEMENT) {
                localname = headerparser.getLocalName();
                if (captureTheTag && localname == containertag) // start copy with the tag
                    start = true;
                if (start) { // copy header
                    // getPrefix() may return null when the element has no prefix
                    String prefix = headerparser.getPrefix();
                    if (prefix != null && prefix.length() > 0)
                        writer.writeStartElement(Nscontext.getNamespaceURI(prefix), localname)
                    else
                        writer.writeStartElement(localname);
                    for (int i = 0 ; i < headerparser.getNamespaceCount(); i++)
                        writer.writeNamespace(headerparser.getNamespacePrefix(i), headerparser.getNamespaceURI(i));
                    for (int i = 0 ; i < headerparser.getAttributeCount(); i++)
                        writer.writeAttribute(headerparser.getAttributeLocalName(i), headerparser.getAttributeValue(i));
                }
                if (!captureTheTag && localname == containertag) // start copy after the tag
                    start = true;
            } else if (event == XMLStreamConstants.END_ELEMENT) {
                localname = headerparser.getLocalName();
                if (!captureTheTag && localname == containertag)
                    break;// stop before the container end tag is written

                if (start)
                    writer.writeEndElement();

                if (captureTheTag && localname == containertag)
                    break;// stop once the container end tag is written
            } else if (event == XMLStreamConstants.CHARACTERS) {
                if (start)
                    writer.writeCharacters(headerparser.getText());
            } else if (event == XMLStreamConstants.COMMENT) {
                if (start)
                    writer.writeComment(headerparser.getText());
            }
        }
    } finally {
        // XMLStreamReader.close() does not close the underlying stream, so both are released here
        try {
            if (reader != null) reader.close();
        } finally {
            anainputData.close();
        }
    }
}
|
| 140 |
|
|
| 141 |
/**
 * Scans the whole stand-off file once. Collects the first attribute value of
 * every linkGrp element (duplicates are reported on stderr and ignored) and
 * stores the first attribute value of each respStmt element in
 * {@code respStmtID} (the last one read wins).
 *
 * @return the distinct linkGrp values, in document order
 */
private List<String> findGrpLink()
{
    ArrayList<String> links = new ArrayList<String>();
    anainputData = anaurl.openStream();
    try {
        anaparser = anafactory.createXMLStreamReader(anainputData);

        for (int event = anaparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = anaparser.next()) {
            if (event != XMLStreamConstants.START_ELEMENT) continue;

            String name = anaparser.getLocalName();
            if ("linkGrp".equals(name)) {
                String targetsvalue = anaparser.getAttributeValue(0)

                if (links.contains(targetsvalue)) {
                    System.err.println("Warning: Multiple group declaration : "+targetsvalue+" has already been added, the first one will be used")
                } else {
                    links.add(targetsvalue); // add the taxonomy type
                }
            } else if ("respStmt".equals(name)) {
                respStmtID = anaparser.getAttributeValue(0); // one attribute (id) only
            }
        }
        anaparser.close();
    } finally {
        // XMLStreamReader.close() does not close the underlying stream
        anainputData.close();
    }
    return links;
}
|
| 170 |
|
|
| 171 |
/**
 * Builds the link parsers: one stand-off parser per link group, each advanced
 * to the start tag of its own linkGrp element so that the next start tag it
 * reads belongs to that group.
 */
private void buildLinkParsers()
{
    // link groups declared in the standoff file
    links = findGrpLink();
    linkparsers = new HashMap<String, XMLStreamReader>();

    // first pass: open one independent parser per group
    for (String groupName : links) {
        anainputData = new BufferedInputStream(anaurl.openStream());
        XMLStreamReader groupReader = anafactory.createXMLStreamReader(anainputData);
        linkparsers.put(groupName, groupReader);
    }

    // second pass: move each parser forward to its group
    for (String groupName : links) {
        anaparser = linkparsers.get(groupName);
        int event = anaparser.next();
        while (event != XMLStreamConstants.END_DOCUMENT) {
            boolean atWantedGroup = event == XMLStreamConstants.START_ELEMENT &&
                    anaparser.getLocalName().equals("linkGrp") &&
                    anaparser.getAttributeValue(0).equals(groupName);
            if (atWantedGroup) {
                break; // next element is a link start tag
            }
            event = anaparser.next();
        }
    }
}
|
| 201 |
|
|
| 202 |
/**
 * Reads the parser of the given link group up to its next link element and
 * returns the second "#..." part of that element's first attribute value.
 * Link elements whose value does not have the "#a #b" shape are reported on
 * stderr and skipped.
 *
 * @param link the link group
 * @param wordId not used by this method
 * @return the next ana value, or "" when the end of the document is reached
 */
private String getNextAnaValue(String link, String wordId)
{
    anaparser = linkparsers.get(link);
    for (int event = anaparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = anaparser.next()) {
        if (event != XMLStreamConstants.START_ELEMENT) continue;
        if (!anaparser.getLocalName().matches("link")) continue;

        String targetsvalue = anaparser.getAttributeValue(0)
        def matcher = (targetsvalue =~ /#(.*) #(.*)/)
        if (!matcher) {
            System.err.println("Error: getNextAna(): link target is not well formed: = "+anaparser.getAttributeValue(0));
            continue;
        }

        String anavalue = matcher[0][2];
        // NOTE(review): as written this replace is a no-op; an HTML entity may have
        // been lost when the page was extracted - confirm against the repository.
        return anavalue.replace("<", "<");
    }
    return "";
}
|
| 231 |
|
|
| 232 |
/**
 * Writes one txm:ana element per link group for the current word. Each element
 * carries resp="#" + respStmtID and type="#" + group, and its text is the next
 * value read from that group's parser.
 *
 * @param wordId the word id, forwarded to getNextAnaValue
 */
private void writeAnaTag(String wordId)
{
    for (int i = 0 ; i < links.size() ; i++) {
        String group = links.get(i);
        writer.writeStartElement(TXMNS, "ana");
        writer.writeAttribute("resp", "#"+respStmtID);
        writer.writeAttribute("type", "#"+group);
        writer.writeCharacters(getNextAnaValue(group, wordId));
        writer.writeEndElement(); // txm:ana
    }
}
|
| 249 |
|
|
| 250 |
String wordId; |
|
| 251 |
boolean flagSourceDesc = false; |
|
| 252 |
/**
 * Start-tag hook. Records the first attribute value of every taxonomy element
 * in {@code presentTaxonomies}, remembers the value of the first attribute
 * named "id" in {@code wordId}, then lets the parent class handle the tag.
 */
protected void processStartElement()
{
    if ("taxonomy".equals(parser.getLocalName())) {
        presentTaxonomies.add(parser.getAttributeValue(0)); // taxonomy type
    }

    int attributeCount = parser.getAttributeCount();
    int idx = 0;
    while (idx < attributeCount) {
        if ("id".equals(parser.getAttributeLocalName(idx))) {
            wordId = parser.getAttributeValue(idx);
            break;
        }
        idx++;
    }
    super.processStartElement();
}
|
| 267 |
|
|
| 268 |
boolean applicationWritten = false; |
|
| 269 |
boolean taxonomiesWritten = false; |
|
| 270 |
/**
 * End-tag hook. Before the parent class writes the end tag:
 * "w" gets its txm:ana children; "appInfo" and "classDecl" get the matching
 * content of the stand-off file appended; "encodingDesc" gets a new appInfo
 * and/or classDecl element when none was met before; "titleStmt" gets the
 * stand-off respStmt element when {@code flagSourceDesc} is set.
 */
protected void processEndElement() {
    String closing = parser.getLocalName();

    if ("w".equals(closing)) {
        writeAnaTag(wordId);
    } else if ("appInfo".equals(closing)) {
        applicationWritten = true;
        getHeaderInfos("appInfo", false);
    } else if ("classDecl".equals(closing)) {
        taxonomiesWritten = true;
        getHeaderInfos("classDecl", false);
    } else if ("encodingDesc".equals(closing)) {
        if (!applicationWritten) {
            writer.writeStartElement("appInfo");
            getHeaderInfos("appInfo", false);
            writer.writeEndElement(); // appInfo
        }
        if (!taxonomiesWritten) {
            writer.writeStartElement("classDecl");
            getHeaderInfos("classDecl", false);
            writer.writeEndElement(); // classDecl
        }
    } else if ("titleStmt".equals(closing)) {
        if (flagSourceDesc) {
            getHeaderInfos("respStmt", true);
            flagSourceDesc = false;
        }
    }

    super.processEndElement();
}
|
| 310 |
|
|
| 311 |
/** The declarenamespace. */ |
|
| 312 |
boolean declarenamespace = false; |
|
| 313 |
|
|
| 314 |
/**
 * Writes the TEI default namespace and the "txm" namespace on the writer, once:
 * later calls do nothing because {@code declarenamespace} is then true.
 *
 * @return true after writing the declarations, null when they were already written
 */
private declareNamespace() {
    if (declarenamespace) {
        return null;
    }
    writer.writeDefaultNamespace("http://www.tei-c.org/ns/1.0");
    writer.writeNamespace("txm", TXMNS);
    declarenamespace = true
}
|
| 326 |
|
|
| 327 |
/**
 * Manual test entry point: injects two stand-off files, one after the other,
 * into a sample document using hard-coded paths.
 *
 * @param args the arguments (unused)
 */
public static void main(String[] args) {

    // NOTE(review): the JVM does not expand "~"; this path is relative to the working directory.
    String rootDir = "~/xml/rgaqcj/";
    new File(rootDir + "/injection/").mkdir();

    // java.io.File has no (String, String, String) constructor: parent and child only
    File srcfile = new File(rootDir + "/anainline/", "roland.xml");
    File pos1file = new File(rootDir + "/pos/", "rolandTT1-w-ana.xml");

    File src2file = new File(rootDir + "/injection/", "roland.xml");
    File pos2file = new File(rootDir + "/pos/", "rolandTT2-w-ana.xml");

    // The class only declares the (URL, URL) constructor, so the former third
    // "milestones" argument is no longer passed.
    println("process file : " + srcfile + " with : " + pos1file);
    def builder = new AnnotationInjectionFromTTOutput(srcfile.toURI().toURL(),
            pos1file.toURI().toURL());
    // NOTE(review): transfomFile is not defined in this class; presumably inherited
    // from StaxIdentityParser - confirm the method name and signature.
    builder.transfomFile(new File(rootDir + "/injection/", "roland.xml"));

    println("process file : " + src2file + " with : " + pos2file);
    builder = new AnnotationInjectionFromTTOutput(src2file.toURI().toURL(), pos2file.toURI().toURL());
    builder.transfomFile(new File(rootDir + "/injection/", "roland-FINAL.xml"));

    return;
}
|
| 360 |
} |
|
| tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/package.html (revision 927) | ||
|---|---|---|
| 1 |
<html> |
|
| 2 |
<body> |
|
| 3 |
<p>Manages xml-tei-txm files</p> |
|
| 4 |
</body> |
|
| 5 |
</html> |
|
| 0 | 6 | |
| tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/BuildTTSrc.groovy (revision 927) | ||
|---|---|---|
| 1 |
// Copyright © 2010-2013 ENS de Lyon. |
|
| 2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
|
| 3 |
// Lyon 2, University of Franche-Comté, University of Nice |
|
| 4 |
// Sophia Antipolis, University of Paris 3. |
|
| 5 |
// |
|
| 6 |
// The TXM platform is free software: you can redistribute it |
|
| 7 |
// and/or modify it under the terms of the GNU General Public |
|
| 8 |
// License as published by the Free Software Foundation, |
|
| 9 |
// either version 2 of the License, or (at your option) any |
|
| 10 |
// later version. |
|
| 11 |
// |
|
| 12 |
// The TXM platform is distributed in the hope that it will be |
|
| 13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
|
| 14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
|
| 15 |
// PURPOSE. See the GNU General Public License for more |
|
| 16 |
// details. |
|
| 17 |
// |
|
| 18 |
// You should have received a copy of the GNU General |
|
| 19 |
// Public License along with the TXM platform. If not, see |
|
| 20 |
// http://www.gnu.org/licenses. |
|
| 21 |
// |
|
| 22 |
// |
|
| 23 |
// |
|
| 24 |
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $ |
|
| 25 |
// $LastChangedRevision: 3400 $ |
|
| 26 |
// $LastChangedBy: mdecorde $ |
|
| 27 |
// |
|
| 28 |
package org.txm.importer.xmltxm |
|
| 29 |
|
|
| 30 |
import java.text.DateFormat; |
|
| 31 |
import java.util.Date; |
|
| 32 |
import java.util.ArrayList; |
|
| 33 |
import javax.xml.stream.*; |
|
| 34 |
import java.net.URL; |
|
| 35 |
|
|
| 36 |
import org.txm.Toolbox; |
|
| 37 |
import org.txm.importer.filters.*; |
|
| 38 |
import org.txm.treetagger.core.preferences.TreeTaggerPreferences |
|
| 39 |
|
|
| 40 |
// TODO: Auto-generated Javadoc |
|
| 41 |
/** |
|
| 42 |
* The Class BuildTTSrc. |
|
| 43 |
* |
|
| 44 |
* @author mdecorde |
|
| 45 |
* build the TT source for tigerSearch |
|
| 46 |
*/ |
|
| 47 |
|
|
| 48 |
public class BuildTTSrc {
|
|
| 49 |
|
|
| 50 |
/** The url. */ |
|
| 51 |
private def url; |
|
| 52 |
|
|
| 53 |
/** The input data. */ |
|
| 54 |
private def inputData; |
|
| 55 |
|
|
| 56 |
/** The factory. */ |
|
| 57 |
private def factory; |
|
| 58 |
|
|
| 59 |
/** The parser. */ |
|
| 60 |
private XMLStreamReader parser; |
|
| 61 |
|
|
| 62 |
/** The output. */ |
|
| 63 |
private BufferedWriter output; |
|
| 64 |
|
|
| 65 |
/**
 * Instantiates a new BuildTTSrc (uses XML-TXM V2, per the original comment):
 * opens the given URL and creates the XML stream parser that
 * {@code process} reads. XMLStreamException and IOException are only
 * printed, in which case the parser may be left unset.
 *
 * @param url the url of the file to process
 */
public BuildTTSrc(URL url) {
    this.url = url;
    try {
        inputData = url.openStream();
        factory = XMLInputFactory.newInstance();
        parser = factory.createXMLStreamReader(inputData);
    } catch (XMLStreamException ex) {
        System.out.println(ex);
    } catch (IOException ex) {
        System.out.println("IOException while parsing ");
    }
}
|
| 84 |
|
|
| 85 |
/**
 * Opens {@code output} as a UTF-8 buffered writer on the given file.
 *
 * @param outfile the file to write
 * @return true when the writer was opened, false (after printing the error message) otherwise
 */
private boolean createOutput(File outfile) {
    try {
        FileOutputStream rawStream = new FileOutputStream(outfile);
        output = new BufferedWriter(new OutputStreamWriter(rawStream, "UTF-8"));
    } catch (Exception e) {
        System.out.println(e.getLocalizedMessage());
        return false;
    }
    return true;
}
|
| 102 |
|
|
| 103 |
/**
 * Writes the TreeTagger input file: one line per form element (its trimmed
 * text with line breaks removed, or "__EMPTY__" when blank) and one line
 * "&lt;s&gt;" / "&lt;/s&gt;" for each s start / end tag.
 * The writer, the parser and the input stream are released on every path.
 *
 * @param outfile the outfile
 * @param formtype currently unused: the code selecting a form by type is commented out, every form is written
 * @return true, if successful; false when the output cannot be opened or an exception occurs while processing
 */
public boolean process(File outfile, String formtype) {
    if (!createOutput(outfile))
        return false;

    boolean flagform = false; // true while inside a form element
    StringBuilder form = new StringBuilder(); // text of the current form element
    StringBuilder buffer = new StringBuilder(); // whole content of the output file
    try {
        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
            if (event == XMLStreamConstants.START_ELEMENT) {
                String localname = parser.getLocalName();
                if ("form".equals(localname)) {
                    flagform = true;
                    form.setLength(0);
                } else if ("s".equals(localname)) { // TreeTagger can use s tags
                    buffer.append("<s>\n");
                }
            } else if (event == XMLStreamConstants.END_ELEMENT) {
                String localname = parser.getLocalName();
                if ("form".equals(localname)) {
                    flagform = false;
                    String text = form.toString().trim()
                    // NOTE(review): the last replace is a no-op as written; an HTML entity may
                    // have been lost when the page was extracted - confirm against the repository.
                    if (text.length() == 0) buffer.append("__EMPTY__\n");
                    else buffer.append(text.replace("\n", "").replace("<", "<")+ "\n");
                } else if ("s".equals(localname)) {
                    buffer.append("</s>\n");
                }
            } else if (event == XMLStreamConstants.CHARACTERS) {
                if (flagform) {
                    String chunk = parser.getText();
                    if (chunk.length() > 0)
                        form.append(chunk);
                }
            }
        }

        String str = buffer.toString()
        // NOTE(review): apostrophes are normalised when FIX_APOSTROPHES is "false" - confirm this is not inverted.
        if ("false".equals(TreeTaggerPreferences.getString(TreeTaggerPreferences.FIX_APOSTROPHES, TreeTaggerPreferences.PREFERENCES_NODE))) {
            str = str.replace("’", "'").replace("‘", "'");
        }
        output.write(str)
        // closed here so that a failing flush is reported as a failure
        output.close();
    } catch (Exception ex) {
        System.out.println(ex);
        return false;
    } finally {
        // best-effort release; closing an already closed writer has no effect
        try { if (output != null) output.close(); } catch (Exception ignored) { }
        try { if (parser != null) parser.close(); } catch (Exception ignored) { }
        try { if (inputData != null) inputData.close(); } catch (Exception ignored) { }
    }

    return true;
}
|
| 188 |
|
|
| 189 |
/** |
|
| 190 |
* The main method. |
|
| 191 |
* |
|
| 192 |
* @param args the arguments |
|
| 193 |
*/ |
|
| 194 |
public static void main(String[] args) {
|
|
| 195 |
|
|
| 196 |
String rootDir = "~/xml/rgaqcj/"; |
|
| 197 |
// new File(rootDir+"/identity/").mkdir(); |
|
| 198 |
|
|
| 199 |
ArrayList<String> milestones = new ArrayList<String>();// the tags who |
|
| 200 |
// you want them |
|
| 201 |
// to stay |
|
| 202 |
// milestones |
|
Formats disponibles : Unified diff