Révision 927
tmp/TXMBetaPlugins.site/site.xml (revision 927) | ||
---|---|---|
1 | 1 |
<?xml version="1.0" encoding="UTF-8"?> |
2 | 2 |
<site> |
3 |
<feature url="features/org.txm.ahc.feature_1.0.0.qualifier.jar" id="org.txm.ahc.feature" version="1.0.0.qualifier"/>
|
|
4 |
<feature url="features/org.txm.analec.feature_1.0.0.qualifier.jar" id="org.txm.analec.feature" version="1.0.0.qualifier"/>
|
|
5 |
<feature url="features/org.txm.annotation.feature_1.0.0.qualifier.jar" id="org.txm.annotation.feature" version="1.0.0.qualifier"/>
|
|
6 |
<feature url="features/org.txm.annotation.kr.feature_1.0.0.qualifier.jar" id="org.txm.annotation.kr.feature" version="1.0.0.qualifier"/>
|
|
7 |
<feature url="features/org.txm.backtomedia.feature_1.0.0.qualifier.jar" id="org.txm.backtomedia.feature" version="1.0.0.qualifier"/>
|
|
8 |
<feature url="features/org.txm.ca.feature_1.0.0.qualifier.jar" id="org.txm.ca.feature" version="1.0.0.qualifier"/>
|
|
9 |
<feature url="features/org.txm.chartsengine.jfreechart.feature_1.0.0.qualifier.jar" id="org.txm.chartsengine.jfreechart.feature" version="1.0.0.qualifier"/>
|
|
10 |
<feature url="features/org.txm.chartsengine.r.feature_1.0.0.qualifier.jar" id="org.txm.chartsengine.r.feature" version="1.0.0.qualifier"/>
|
|
11 |
<feature url="features/org.txm.concordance.feature_1.0.0.qualifier.jar" id="org.txm.concordance.feature" version="1.0.0.qualifier"/>
|
|
12 |
<feature url="features/org.txm.cooccurrence.feature_1.0.0.qualifier.jar" id="org.txm.cooccurrence.feature" version="1.0.0.qualifier"/>
|
|
13 |
<feature url="features/org.txm.cql2lsa.feature_1.0.0.qualifier.jar" id="org.txm.cql2lsa.feature" version="1.0.0.qualifier"/>
|
|
14 |
<feature url="features/org.txm.dictionary.feature_1.0.0.qualifier.jar" id="org.txm.dictionary.feature" version="1.0.0.qualifier"/>
|
|
15 |
<feature url="features/org.txm.edition.feature_1.0.0.qualifier.jar" id="org.txm.edition.feature" version="1.0.0.qualifier"/>
|
|
16 |
<feature url="features/org.txm.groovy.feature_1.0.0.qualifier.jar" id="org.txm.groovy.feature" version="1.0.0.qualifier"/>
|
|
17 |
<feature url="features/org.txm.index.feature_1.0.0.qualifier.jar" id="org.txm.index.feature" version="1.0.0.qualifier"/>
|
|
18 |
<feature url="features/org.txm.internalview.feature_1.0.0.qualifier.jar" id="org.txm.internalview.feature" version="1.0.0.qualifier"/>
|
|
19 |
<feature url="features/org.txm.lexicaltable.feature_1.0.0.qualifier.jar" id="org.txm.lexicaltable.feature" version="1.0.0.qualifier"/>
|
|
20 |
<feature url="features/org.txm.partition.feature_1.0.0.qualifier.jar" id="org.txm.partition.feature" version="1.0.0.qualifier"/>
|
|
21 |
<feature url="features/org.txm.progression.feature_1.0.0.qualifier.jar" id="org.txm.progression.feature" version="1.0.0.qualifier"/>
|
|
22 |
<feature url="features/org.txm.properties.feature_1.0.0.qualifier.jar" id="org.txm.properties.feature" version="1.0.0.qualifier"/>
|
|
23 |
<feature url="features/org.txm.rcp.feature_1.0.0.qualifier.jar" id="org.txm.rcp.feature" version="1.0.0.qualifier"/>
|
|
24 |
<feature url="features/org.txm.referencer.feature_1.0.0.qualifier.jar" id="org.txm.referencer.feature" version="1.0.0.qualifier"/>
|
|
25 |
<feature url="features/org.txm.searchengine.cqp.feature_1.0.0.qualifier.jar" id="org.txm.searchengine.cqp.feature" version="1.0.0.qualifier"/>
|
|
26 |
<feature url="features/org.txm.specificities.feature_1.0.0.qualifier.jar" id="org.txm.specificities.feature" version="1.0.0.qualifier"/>
|
|
27 |
<feature url="features/org.txm.statsengine.r.feature_1.0.0.qualifier.jar" id="org.txm.statsengine.r.feature" version="1.0.0.qualifier"/>
|
|
28 |
<feature url="features/org.txm.svn.feature_1.0.0.qualifier.jar" id="org.txm.svn.feature" version="1.0.0.qualifier"/>
|
|
29 |
<feature url="features/org.txm.textbalance.feature_1.0.0.qualifier.jar" id="org.txm.textbalance.feature" version="1.0.0.qualifier"/>
|
|
30 |
<feature url="features/org.txm.tigersearch.feature_1.0.0.qualifier.jar" id="org.txm.tigersearch.feature" version="1.0.0.qualifier"/>
|
|
31 |
<feature url="features/org.txm.treetagger.feature_1.0.0.qualifier.jar" id="org.txm.treetagger.feature" version="1.0.0.qualifier"/>
|
|
32 |
<feature url="features/org.txm.treetagger.files.feature_1.0.0.qualifier.jar" id="org.txm.treetagger.files.feature" version="1.0.0.qualifier"/>
|
|
33 |
<feature url="features/org.txm.wordcloud.feature_1.0.0.qualifier.jar" id="org.txm.wordcloud.feature" version="1.0.0.qualifier"/>
|
|
3 |
<feature url="features/org.txm.ahc.feature_1.0.0.201806251537.jar" id="org.txm.ahc.feature" version="1.0.0.201806251537"/>
|
|
4 |
<feature url="features/org.txm.analec.feature_1.0.0.201806251537.jar" id="org.txm.analec.feature" version="1.0.0.201806251537"/>
|
|
5 |
<feature url="features/org.txm.annotation.feature_1.0.0.201806251537.jar" id="org.txm.annotation.feature" version="1.0.0.201806251537"/>
|
|
6 |
<feature url="features/org.txm.annotation.kr.feature_1.0.0.201806251537.jar" id="org.txm.annotation.kr.feature" version="1.0.0.201806251537"/>
|
|
7 |
<feature url="features/org.txm.backtomedia.feature_1.0.0.201806251537.jar" id="org.txm.backtomedia.feature" version="1.0.0.201806251537"/>
|
|
8 |
<feature url="features/org.txm.ca.feature_1.0.0.201806251537.jar" id="org.txm.ca.feature" version="1.0.0.201806251537"/>
|
|
9 |
<feature url="features/org.txm.chartsengine.jfreechart.feature_1.0.0.201806251537.jar" id="org.txm.chartsengine.jfreechart.feature" version="1.0.0.201806251537"/>
|
|
10 |
<feature url="features/org.txm.chartsengine.r.feature_1.0.0.201806251537.jar" id="org.txm.chartsengine.r.feature" version="1.0.0.201806251537"/>
|
|
11 |
<feature url="features/org.txm.concordance.feature_1.0.0.201806251537.jar" id="org.txm.concordance.feature" version="1.0.0.201806251537"/>
|
|
12 |
<feature url="features/org.txm.cooccurrence.feature_1.0.0.201806251537.jar" id="org.txm.cooccurrence.feature" version="1.0.0.201806251537"/>
|
|
13 |
<feature url="features/org.txm.cql2lsa.feature_1.0.0.201806251537.jar" id="org.txm.cql2lsa.feature" version="1.0.0.201806251537"/>
|
|
14 |
<feature url="features/org.txm.dictionary.feature_1.0.0.201806251537.jar" id="org.txm.dictionary.feature" version="1.0.0.201806251537"/>
|
|
15 |
<feature url="features/org.txm.edition.feature_1.0.0.201806251537.jar" id="org.txm.edition.feature" version="1.0.0.201806251537"/>
|
|
16 |
<feature url="features/org.txm.groovy.feature_1.0.0.201806251537.jar" id="org.txm.groovy.feature" version="1.0.0.201806251537"/>
|
|
17 |
<feature url="features/org.txm.index.feature_1.0.0.201806251537.jar" id="org.txm.index.feature" version="1.0.0.201806251537"/>
|
|
18 |
<feature url="features/org.txm.internalview.feature_1.0.0.201806251537.jar" id="org.txm.internalview.feature" version="1.0.0.201806251537"/>
|
|
19 |
<feature url="features/org.txm.lexicaltable.feature_1.0.0.201806251537.jar" id="org.txm.lexicaltable.feature" version="1.0.0.201806251537"/>
|
|
20 |
<feature url="features/org.txm.partition.feature_1.0.0.201806251537.jar" id="org.txm.partition.feature" version="1.0.0.201806251537"/>
|
|
21 |
<feature url="features/org.txm.progression.feature_1.0.0.201806251537.jar" id="org.txm.progression.feature" version="1.0.0.201806251537"/>
|
|
22 |
<feature url="features/org.txm.properties.feature_1.0.0.201806251537.jar" id="org.txm.properties.feature" version="1.0.0.201806251537"/>
|
|
23 |
<feature url="features/org.txm.rcp.feature_1.0.0.201806251537.jar" id="org.txm.rcp.feature" version="1.0.0.201806251537"/>
|
|
24 |
<feature url="features/org.txm.referencer.feature_1.0.0.201806251537.jar" id="org.txm.referencer.feature" version="1.0.0.201806251537"/>
|
|
25 |
<feature url="features/org.txm.searchengine.cqp.feature_1.0.0.201806251537.jar" id="org.txm.searchengine.cqp.feature" version="1.0.0.201806251537"/>
|
|
26 |
<feature url="features/org.txm.specificities.feature_1.0.0.201806251537.jar" id="org.txm.specificities.feature" version="1.0.0.201806251537"/>
|
|
27 |
<feature url="features/org.txm.statsengine.r.feature_1.0.0.201806251537.jar" id="org.txm.statsengine.r.feature" version="1.0.0.201806251537"/>
|
|
28 |
<feature url="features/org.txm.svn.feature_1.0.0.201806251537.jar" id="org.txm.svn.feature" version="1.0.0.201806251537"/>
|
|
29 |
<feature url="features/org.txm.textbalance.feature_1.0.0.201806251537.jar" id="org.txm.textbalance.feature" version="1.0.0.201806251537"/>
|
|
30 |
<feature url="features/org.txm.tigersearch.feature_1.0.0.201806251537.jar" id="org.txm.tigersearch.feature" version="1.0.0.201806251537"/>
|
|
31 |
<feature url="features/org.txm.treetagger.feature_1.0.0.201806251537.jar" id="org.txm.treetagger.feature" version="1.0.0.201806251537"/>
|
|
32 |
<feature url="features/org.txm.treetagger.files.feature_1.0.0.201806251537.jar" id="org.txm.treetagger.files.feature" version="1.0.0.201806251537"/>
|
|
33 |
<feature url="features/org.txm.wordcloud.feature_1.0.0.201806251537.jar" id="org.txm.wordcloud.feature" version="1.0.0.201806251537"/>
|
|
34 | 34 |
</site> |
tmp/org.txm.treetagger.core/plugin.xml (revision 927) | ||
---|---|---|
7 | 7 |
class="org.txm.treetagger.core.preferences.TreeTaggerPreferences"> |
8 | 8 |
</initializer> |
9 | 9 |
</extension> |
10 |
<extension |
|
11 |
point="org.txm.annotation.core.AnnotationEngine"> |
|
12 |
<AnnotationEngine |
|
13 |
class="org.txm.treetagger.core.TreeTaggerEngine" |
|
14 |
description="Call TreeTagger treetagger train processus to annotate corpus"> |
|
15 |
</AnnotationEngine> |
|
16 |
</extension> |
|
10 | 17 |
|
11 | 18 |
</plugin> |
tmp/org.txm.treetagger.core/.settings/org.eclipse.jdt.core.prefs (revision 927) | ||
---|---|---|
1 | 1 |
eclipse.preferences.version=1 |
2 | 2 |
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled |
3 |
org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate |
|
3 | 4 |
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 |
5 |
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve |
|
4 | 6 |
org.eclipse.jdt.core.compiler.compliance=1.6 |
7 |
org.eclipse.jdt.core.compiler.debug.lineNumber=generate |
|
8 |
org.eclipse.jdt.core.compiler.debug.localVariable=generate |
|
9 |
org.eclipse.jdt.core.compiler.debug.sourceFile=generate |
|
5 | 10 |
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error |
6 | 11 |
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error |
7 | 12 |
org.eclipse.jdt.core.compiler.source=1.6 |
tmp/org.txm.treetagger.core/.settings/org.eclipse.jdt.groovy.core.prefs (revision 927) | ||
---|---|---|
1 |
eclipse.preferences.version=1 |
|
2 |
groovy.compiler.level=25 |
|
3 |
groovy.script.filters=**/*.dsld,y,**/*.gradle,n |
|
0 | 4 |
tmp/org.txm.treetagger.core/.classpath (revision 927) | ||
---|---|---|
7 | 7 |
</accessrules> |
8 | 8 |
</classpathentry> |
9 | 9 |
<classpathentry kind="src" path="src"/> |
10 |
<classpathentry exported="true" kind="con" path="GROOVY_SUPPORT"/> |
|
11 |
<classpathentry exported="true" kind="con" path="GROOVY_DSL_SUPPORT"/> |
|
10 | 12 |
<classpathentry kind="output" path="bin"/> |
11 | 13 |
</classpath> |
tmp/org.txm.treetagger.core/META-INF/MANIFEST.MF (revision 927) | ||
---|---|---|
1 | 1 |
Manifest-Version: 1.0 |
2 |
Require-Bundle: org.txm.core;bundle-version="0.8.0";visibility:=reexport |
|
3 |
Export-Package: org.txm.treetagger.core.preferences |
|
2 |
Require-Bundle: org.txm.annotation.core;visibility:=reexport |
|
3 |
Export-Package: org.txm.treetagger.core, |
|
4 |
org.txm.treetagger.core.preferences |
|
4 | 5 |
Bundle-ActivationPolicy: lazy |
5 | 6 |
Bundle-Version: 1.0.0.qualifier |
6 | 7 |
Bundle-Name: TreeTagger Core |
tmp/org.txm.treetagger.core/.project (revision 927) | ||
---|---|---|
22 | 22 |
</buildCommand> |
23 | 23 |
</buildSpec> |
24 | 24 |
<natures> |
25 |
<nature>org.eclipse.jdt.groovy.core.groovyNature</nature> |
|
25 | 26 |
<nature>org.eclipse.pde.PluginNature</nature> |
26 | 27 |
<nature>org.eclipse.jdt.core.javanature</nature> |
27 | 28 |
</natures> |
tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/AnnotateWTC.groovy (revision 927) | ||
---|---|---|
1 |
// Copyright © 2010-2013 ENS de Lyon. |
|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice |
|
4 |
// Sophia Antipolis, University of Paris 3. |
|
5 |
// |
|
6 |
// The TXM platform is free software: you can redistribute it |
|
7 |
// and/or modify it under the terms of the GNU General Public |
|
8 |
// License as published by the Free Software Foundation, |
|
9 |
// either version 2 of the License, or (at your option) any |
|
10 |
// later version. |
|
11 |
// |
|
12 |
// The TXM platform is distributed in the hope that it will be |
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
|
15 |
// PURPOSE. See the GNU General Public License for more |
|
16 |
// details. |
|
17 |
// |
|
18 |
// You should have received a copy of the GNU General |
|
19 |
// Public License along with the TXM platform. If not, see |
|
20 |
// http://www.gnu.org/licenses. |
|
21 |
// |
|
22 |
// |
|
23 |
// |
|
24 |
// $LastChangedDate: 2013-12-06 16:57:35 +0100 (Fri, 06 Dec 2013) $ |
|
25 |
// $LastChangedRevision: 2583 $ |
|
26 |
// $LastChangedBy: mdecorde $ |
|
27 |
// |
|
28 |
package org.txm.importer.xmltxm |
|
29 |
|
|
30 |
import java.io.File |
|
31 |
import java.text.DateFormat |
|
32 |
import java.util.Date |
|
33 |
|
|
34 |
import org.txm.Toolbox |
|
35 |
import org.txm.importer.* |
|
36 |
import org.txm.importer.cwb.* |
|
37 |
import org.txm.objects.* |
|
38 |
import org.txm.stat.utils.ConsoleProgressBar |
|
39 |
import org.txm.utils.DeleteDir; |
|
40 |
import org.txm.utils.LangDetector; |
|
41 |
import org.txm.utils.treetagger.TreeTagger |
|
42 |
|
|
43 |
// TODO: Auto-generated Javadoc |
|
44 |
/**
 * Applies a TreeTagger model to every file of a directory and writes each
 * result, under the same file name, into the "annotations" sub-directory of
 * the given binary directory.
 */
class AnnotateCQP {

    /** When true, run() prints the directories and TreeTagger paths it works with. */
    boolean debug = true;

    /**
     * Switches debug output on.
     */
    public void setDebug() { debug = true; }

    String id;

    /**
     * Runs TreeTagger on one input file.
     *
     * The TreeTagger install path and extra options are read from the Toolbox
     * preferences; a missing options preference is treated as an empty string.
     *
     * @param ttsrcfile the file given to TreeTagger as input
     * @param ttoutfile the file TreeTagger is asked to write
     * @param modelfile the TreeTagger model file
     * @return true if the call completed, false if any exception was raised
     */
    public boolean applyTT(File ttsrcfile, File ttoutfile, File modelfile)
    {
        try {
            // TODO: why org.txm.utils.treetagger.TreeTagger tt = ..., throw IllegalAccessError ???
            String opt = Toolbox.getPreference(Toolbox.TREETAGGER_OPTIONS);
            if (opt == null) opt = "";
            String[] options = opt.split(" ");

            def tt = new TreeTagger(Toolbox.getPreference(Toolbox.TREETAGGER_INSTALL_PATH)+"/bin/", options);
            tt.settoken();
            tt.setlemma();
            tt.setquiet();
            tt.setsgml();
            tt.setnounknown();
            tt.seteostag("<s>");
            tt.treetagger(modelfile.getAbsolutePath(), ttsrcfile.getAbsolutePath(), ttoutfile.getAbsolutePath())
            // the input file is deliberately kept (its deletion was disabled)
        }
        catch (Exception e)
        {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * Tags every file of txmDir and stores the results in binDir/annotations.
     *
     * A file is skipped when its output already exists, is at least as recent
     * as the input and is strictly bigger than the input.
     *
     * @param binDir the directory that receives the "annotations" sub-directory
     * @param txmDir the directory whose files are tagged
     * @param modelfilename the model file name, resolved against the TreeTagger models path preference
     * @return true if every file was tagged or skipped; false if the TreeTagger
     *         binaries or the model file are missing, if txmDir has no files
     *         (or cannot be listed), or if tagging one file failed
     */
    public boolean run(File binDir, File txmDir, String modelfilename)
    {
        def installPath = Toolbox.getPreference(Toolbox.TREETAGGER_INSTALL_PATH);
        def modelsPath = Toolbox.getPreference(Toolbox.TREETAGGER_MODELS_PATH);

        if (debug) {
            println "rootDirFile "+binDir
            println "txmDir "+txmDir
            println "TREETAGGER INSTALL PATH : "+installPath
            println "TREETAGGER MODELS PATH : "+modelsPath
        }

        // test if the Toolbox knows TreeTagger
        if (!new File(installPath+"/bin/").exists())
        {
            println("Could not find TreeTagger binaries in "+installPath+"/bin/")
            return false;
        }

        // output directory (created if missing, existing content is kept)
        File annotDir = new File(binDir, "annotations")
        annotDir.mkdir();

        // listFiles() returns null when txmDir is not a readable directory
        List<File> files = txmDir.listFiles()
        int fileCount = (files == null) ? 0 : files.size();

        // get model file and check it
        File modelfile = new File(modelsPath, modelfilename);
        if (debug)
            println "model file : "+modelfile;

        if (!modelfile.exists()) {
            println "Skipping ANNOTATE: Incorrect modelfile path: "+modelfile;
            if (System.getProperty("os.name").startsWith("Windows"))
                println "Windows users: Windows might be hiding files extension. To see them, in the explorer parameters."
            return false;
        }

        // APPLY TREETAGGER
        println("Applying $modelfilename TreeTagger model on dir: "+txmDir+ " ("+fileCount+" files)")
        if (fileCount == 0)
            return false;

        ConsoleProgressBar cpb = new ConsoleProgressBar(fileCount);
        for (File infile : files) {
            cpb.tick();

            File outfile = new File(annotDir, infile.getName());
            boolean upToDate = outfile.exists() &&                    // outfile exists
                    outfile.lastModified() >= infile.lastModified() && // outfile is more recent
                    outfile.length() > infile.length();                // outfile is bigger
            if (upToDate) continue;

            if (!applyTT(infile, outfile, modelfile)) {
                System.out.println("Failed to apply treetagger on file "+infile);
                return false;
            }
        }
        println("")

        // NOTE: replacing txmDir with the 'annotations' directory is disabled here.
        return true;
    }
}
tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/Annotate.groovy (revision 927) | ||
---|---|---|
1 |
// Copyright © 2010-2013 ENS de Lyon. |
|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice |
|
4 |
// Sophia Antipolis, University of Paris 3. |
|
5 |
// |
|
6 |
// The TXM platform is free software: you can redistribute it |
|
7 |
// and/or modify it under the terms of the GNU General Public |
|
8 |
// License as published by the Free Software Foundation, |
|
9 |
// either version 2 of the License, or (at your option) any |
|
10 |
// later version. |
|
11 |
// |
|
12 |
// The TXM platform is distributed in the hope that it will be |
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
|
15 |
// PURPOSE. See the GNU General Public License for more |
|
16 |
// details. |
|
17 |
// |
|
18 |
// You should have received a copy of the GNU General |
|
19 |
// Public License along with the TXM platform. If not, see |
|
20 |
// http://www.gnu.org/licenses. |
|
21 |
// |
|
22 |
// |
|
23 |
// |
|
24 |
// $LastChangedDate: 2016-10-03 15:30:36 +0200 (lun. 03 oct. 2016) $ |
|
25 |
// $LastChangedRevision: 3313 $ |
|
26 |
// $LastChangedBy: mdecorde $ |
|
27 |
// |
|
28 |
package org.txm.importer.xmltxm |
|
29 |
|
|
30 |
import java.io.File |
|
31 |
import java.text.DateFormat |
|
32 |
import java.util.Date |
|
33 |
import java.util.concurrent.* |
|
34 |
|
|
35 |
import org.txm.Toolbox |
|
36 |
import org.txm.importer.* |
|
37 |
import org.txm.importer.cwb.* |
|
38 |
import org.txm.objects.* |
|
39 |
import org.txm.stat.utils.ConsoleProgressBar; |
|
40 |
import org.txm.treetagger.core.preferences.TreeTaggerPreferences |
|
41 |
import org.txm.utils.LangDetector; |
|
42 |
import org.txm.utils.logger.Log; |
|
43 |
import org.txm.utils.treetagger.TreeTagger |
|
44 |
|
|
45 |
/** |
|
46 |
* Annotate and replace the TEI-TXM files of the folder $rootDirFile/txm with TreeTagger. |
|
47 |
* creates $rootDirFile/interp and $rootDirFile/treetagger |
|
48 |
* |
|
49 |
*/ |
|
50 |
class Annotate { |
|
51 |
boolean cancelNow = false; |
|
52 |
|
|
53 |
/** The report file. */ |
|
54 |
File reportFile;//contains the txm:application tag content |
|
55 |
|
|
56 |
/** The resp person. */ |
|
57 |
String respPerson; |
|
58 |
|
|
59 |
/** The resp id. */ |
|
60 |
String respId; |
|
61 |
|
|
62 |
/** The resp desc. */ |
|
63 |
String respDesc; |
|
64 |
|
|
65 |
/** The resp date. */ |
|
66 |
String respDate; |
|
67 |
|
|
68 |
/** The resp when. */ |
|
69 |
String respWhen; |
|
70 |
|
|
71 |
/** The app ident. */ |
|
72 |
String appIdent; |
|
73 |
|
|
74 |
/** The app version. */ |
|
75 |
String appVersion; |
|
76 |
|
|
77 |
/** The distributor. */ |
|
78 |
String distributor; |
|
79 |
|
|
80 |
/** The publi stmt. */ |
|
81 |
String publiStmt; |
|
82 |
|
|
83 |
/** The source stmt. */ |
|
84 |
String sourceStmt; |
|
85 |
|
|
86 |
/** The types. */ |
|
87 |
def types; |
|
88 |
|
|
89 |
/** The types title. */ |
|
90 |
def typesTITLE; |
|
91 |
|
|
92 |
/** The types desc. */ |
|
93 |
def typesDesc; |
|
94 |
|
|
95 |
/** The types tagset. */ |
|
96 |
def typesTAGSET; |
|
97 |
|
|
98 |
/** The types web. */ |
|
99 |
def typesWEB; |
|
100 |
|
|
101 |
/** The idform. */ |
|
102 |
String idform; |
|
103 |
|
|
104 |
/** The debug. */ |
|
105 |
boolean debug = false; |
|
106 |
|
|
107 |
File modelsDirectory; |
|
108 |
|
|
109 |
/**
 * Creates an annotator whose models directory is the path stored under
 * MODELS_PATH in the TreeTagger preferences node.
 */
public Annotate() {
    def modelsPath = TreeTaggerPreferences.getString(
            TreeTaggerPreferences.MODELS_PATH,
            TreeTaggerPreferences.PREFERENCES_NODE);
    modelsDirectory = new File(modelsPath);
}
|
112 |
|
|
113 |
/**
 * Turns the debug flag on. There is no way to turn it back off through this method.
 */
public void setDebug() {
    this.debug = true;
}
|
119 |
|
|
120 |
String id; |
|
121 |
/**
 * Convenience overload: initialises the TreeTagger output metadata without
 * explicit word property names (null is forwarded for them).
 *
 * @param rootDirFile the root dir file
 * @param modelfile the model file
 * @param modelfilename the model file name
 */
public void initTTOutfileInfos(File rootDirFile, File modelfile, String modelfilename)
{
    String[] noProperties = null;
    initTTOutfileInfos(rootDirFile, modelfile, modelfilename, noProperties);
}
|
131 |
|
|
132 |
/**
 * Initialises the metadata fields (responsibility, application, annotation
 * types) later used when writing the stand-off files.
 *
 * The annotation id is the part of modelfilename before its first dot;
 * the placeholder "??" is replaced by "xx".
 *
 * @param rootDirFile the directory in which the "NLPToolsParameters.xml" report file is located
 * @param modelfile the model file; its name selects the type descriptions
 * @param modelfilename the model file name the id is derived from
 * @param properties 2 element array that contains the word properties to create.
 *        It can be null (the id derived from modelfilename is then used)
 */
public void initTTOutfileInfos(File rootDirFile, File modelfile, String modelfilename, String[] properties)
{
    id = modelfilename;
    String[] split = id.split("\\.");
    if (split.length > 0) id = split[0];
    if (id.equals("??")) id = "xx"

    reportFile = new File(rootDirFile, "NLPToolsParameters.xml");

    respPerson = System.getProperty("user.name");
    respId = "txm";
    respDesc = "NLP annotation tool";
    // one single instant, so that the short and the full date always agree
    Date now = new Date();
    respDate = DateFormat.getDateInstance(DateFormat.SHORT, Locale.UK).format(now);
    respWhen = DateFormat.getDateInstance(DateFormat.FULL, Locale.UK).format(now);

    appIdent = "TreeTagger";
    appVersion = "3.2";

    distributor = "";
    publiStmt = """""";
    sourceStmt = """""";

    if (properties != null && properties.length == 2) {
        types = [properties[0], properties[1]];
        typesTITLE = [properties[0], properties[1]];
    } else {
        types = [id+"pos", id+"lemma"];
        typesTITLE = [id+"pos", id+"lemma"];
    }

    //TODO: the tagset, website and description should be referenced in the model catalog
    if (modelfile.getName() == "rgaqcj.par") {
        typesDesc = ["CATTEX pos tagset built with BFM texts", "fr lemma of the model "+modelfile+" - "]
        typesTAGSET = ["http://bfm.ens-lyon.fr/IMG/pdf/Cattex2009_Manuel.pdf", ""]
        typesWEB = ["http://bfm.ens-lyon.fr/", ""]
    } else {
        typesDesc = ["pos tagset built from model "+modelfile, id+" lemma of the model "+modelfile+" - "]
        typesTAGSET = ["", ""]
        typesWEB = ["", ""]
    }

    idform = "w";
}
|
182 |
|
|
183 |
/**
 * Convenience overload: runs TreeTagger without explicit options (null is
 * forwarded for them).
 *
 * @param ttsrcfile the TreeTagger input file
 * @param ttoutfile the TreeTagger output file
 * @param modelfile the model file
 * @return true, if successful
 */
public boolean applyTT(File ttsrcfile, File ttoutfile, File modelfile) {
    String[] noOptions = null;
    return applyTT(ttsrcfile, ttoutfile, modelfile, noOptions);
}
|
194 |
|
|
195 |
/**
 * Runs TreeTagger on one input file, then deletes that input file.
 *
 * @param ttsrcfile the TreeTagger input file (deleted after a completed run)
 * @param ttoutfile the file TreeTagger is asked to write
 * @param modelfile the model file
 * @param options the TreeTagger options; if null, the OPTIONS value of the
 *        TreeTagger preferences node is split on spaces and used instead
 *        (a missing preference is treated as an empty string)
 * @return true if the call completed, false if any exception was raised
 */
public boolean applyTT(File ttsrcfile, File ttoutfile, File modelfile, String[] options)
{
    try {
        // TODO: why org.txm.utils.treetagger.TreeTagger tt = ..., throw IllegalAccessError ???
        if (options == null) {
            // the preference is only needed when the caller gave no options
            String opt = TreeTaggerPreferences.getString(TreeTaggerPreferences.OPTIONS, TreeTaggerPreferences.PREFERENCES_NODE);
            if (opt == null) opt = "";
            options = opt.split(" ");
        }

        def tt = new TreeTagger(TreeTaggerPreferences.getString(TreeTaggerPreferences.INSTALL_PATH, TreeTaggerPreferences.PREFERENCES_NODE)+"/bin/", options);
        tt.settoken();
        tt.setlemma();
        tt.setquiet();
        tt.setsgml();
        tt.setnounknown();
        tt.seteostag("<s>");
        tt.debug(debug);
        tt.treetagger(modelfile.getAbsolutePath(), ttsrcfile.getAbsolutePath(), ttoutfile.getAbsolutePath())
        ttsrcfile.delete();
    } catch (Exception e) {
        Log.printStackTrace(e);
        // report the actual input file: the former "$f" was not defined in this
        // method, so building the message itself failed inside the handler
        System.out.println("Failed to apply TreeTagger on $ttsrcfile input file with the $modelfile model file.");
        return false;
    }
    return true;
}
|
234 |
|
|
235 |
/**
 * Converts a TreeTagger output file into a stand-off annotation file, using
 * the metadata fields previously set on this object.
 *
 * @param ttoutfile the TreeTagger output file to read
 * @param posfile the stand-off file to write
 * @return the result of the CSV2W_ANA conversion (true, if successful)
 */
public boolean writeStandoffFile(File ttoutfile, File posfile)
{
    def converter = new CSV2W_ANA();

    converter.setAnnotationTypes(types, typesDesc, typesTAGSET, typesWEB, idform);
    converter.setResp(respId, respDesc, respDate, respPerson, respWhen);
    converter.setApp(appIdent, appVersion);
    converter.setTarget(ttoutfile.getAbsolutePath(), reportFile);
    converter.setInfos(distributor, publiStmt, sourceStmt);

    def done = converter.process(ttoutfile, posfile, "UTF-8");
    return done;
}
|
254 |
|
|
255 |
/** |
|
256 |
* Run step by step : build TT src files, run TT, build xml-standoff files, inject standoff annotations |
|
257 |
* |
|
258 |
* @param rootDirFile the root dir file |
|
259 |
* @param modelfilename the modelfilename |
|
260 |
* @return true, if successful |
|
261 |
*/ |
|
262 |
public boolean run(File binDir, File txmDir, String modelfilename) |
|
263 |
{ |
|
264 |
//test if modelfile exists |
|
265 |
if (debug) { |
|
266 |
println "rootDirFile "+binDir |
|
267 |
println "txmDir "+txmDir |
|
268 |
println "TREETAGGER INSTALL PATH : "+TreeTaggerPreferences.getString(TreeTaggerPreferences.INSTALL_PATH, TreeTaggerPreferences.PREFERENCES_NODE); |
|
269 |
println "TREETAGGER MODELS PATH : "+TreeTaggerPreferences.getString(TreeTaggerPreferences.MODELS_PATH, TreeTaggerPreferences.PREFERENCES_NODE) |
|
270 |
} |
|
271 |
|
|
272 |
//test if the Toolbox know TreeTagger |
|
273 |
if (!new File(TreeTaggerPreferences.getString(TreeTaggerPreferences.INSTALL_PATH, TreeTaggerPreferences.PREFERENCES_NODE)+"/bin/").exists()) { |
|
274 |
println("Could not find TreeTagger binaries in "+TreeTaggerPreferences.getString(TreeTaggerPreferences.INSTALL_PATH, TreeTaggerPreferences.PREFERENCES_NODE)+"/bin/") |
|
275 |
return false; |
|
276 |
} |
|
277 |
String langAll = null |
|
278 |
String lang; |
|
279 |
if (modelfilename.startsWith("??")) { |
|
280 |
langAll = new LangDetector(binDir).getLang(); |
|
281 |
println "General lang $langAll" |
|
282 |
} |
|
283 |
|
|
284 |
//cleaning |
|
285 |
new File(binDir, "annotations").deleteDir(); |
|
286 |
new File(binDir, "annotations").mkdir(); |
|
287 |
new File(binDir, "treetagger").deleteDir(); |
|
288 |
new File(binDir, "treetagger").mkdir(); |
|
289 |
|
|
290 |
ArrayList<String> milestones = []; |
|
291 |
|
|
292 |
//BUILD TT FILE READY TO BE TAGGED |
|
293 |
List<File> files = txmDir.listFiles() |
|
294 |
|
|
295 |
println("Building TT source files ("+files.size()+") from directory "+txmDir) |
|
296 |
ConsoleProgressBar cpb = new ConsoleProgressBar(files.size()) |
|
297 |
for (File f : files) { |
|
298 |
cpb.tick() |
|
299 |
File srcfile = f; |
|
300 |
File resultfile = new File(binDir, "treetagger/"+f.getName()+".tt"); |
|
301 |
if(debug) |
|
302 |
println "build tt src : "+srcfile+" >> "+resultfile |
|
303 |
def ttsrcbuilder = new BuildTTSrc(srcfile.toURI().toURL()) |
|
304 |
if (!ttsrcbuilder.process(resultfile, null)) |
|
305 |
System.out.println("Failed to build tt src file of "+srcfile); |
|
306 |
} |
|
307 |
|
|
308 |
if (cancelNow) return; |
|
309 |
|
|
310 |
File modelDirectory = new File(TreeTaggerPreferences.getString(TreeTaggerPreferences.MODELS_PATH, TreeTaggerPreferences.PREFERENCES_NODE)); |
|
311 |
if (!modelDirectory.exists()) { |
|
312 |
println "Skipping ANNOTATE: TreeTagger language model file directory not found: "+modelDirectory.getAbsolutePath(); |
|
313 |
return false; |
|
314 |
} else if (!modelDirectory.canRead()) { |
|
315 |
println "Skipping ANNOTATE: impossible to access the TreeTagger language model file directory: "+modelDirectory.getAbsolutePath(); |
|
316 |
return false; |
|
317 |
} |
|
318 |
println("") |
|
319 |
//Convert encoding if needed |
|
320 |
|
|
321 |
//APPLY TREETAGGER |
|
322 |
files = new File(binDir, "treetagger").listFiles() |
|
323 |
println("Applying $modelfilename TreeTagger model on dir: "+new File(binDir, "treetagger")+ " ("+files.size()+" files)") |
|
324 |
if (files == null || files.size() == 0) |
|
325 |
return false; |
|
326 |
File modelfile; |
|
327 |
cpb = new ConsoleProgressBar(files.size()) |
|
328 |
for (File f : files) { |
|
329 |
String tmpModelFileName = modelfilename |
|
330 |
if (modelfilename.startsWith("??")) { |
|
331 |
lang = langAll; |
|
332 |
if (f.length() > LangDetector.MINIMALSIZE) { |
|
333 |
lang = new LangDetector(f).getLang(); |
|
334 |
//println "guessing lang $f : $lang" |
|
335 |
} |
|
336 |
tmpModelFileName = lang+".par" |
|
337 |
} |
|
338 |
modelfile = new File(modelsDirectory, tmpModelFileName); |
|
339 |
if (debug) |
|
340 |
println "model file : "+modelfile; |
|
341 |
|
|
342 |
File |
|
343 |
|
|
344 |
if (!modelfile.exists()) { |
|
345 |
println "Skipping ANNOTATE: '$modelfile' TreeTagger language model file not found." |
|
346 |
if(System.getProperty("os.name").startsWith("Windows") || System.getProperty("os.name").startsWith("Mac")) |
|
347 |
println "Windows&Mac users: the operating system might be hiding file extensions. Use your file explorer to check the file name." |
|
348 |
return false; |
|
349 |
} else if (!modelfile.canRead()) { |
|
350 |
println "Skipping ANNOTATE: impossible to access the '$modelfile' TreeTagger language model file." |
|
351 |
return false; |
|
352 |
} |
|
353 |
|
|
354 |
// if (modelfile.getName().equals("sp.par")) {//UTF >> Latin1 |
|
355 |
// if(debug) |
|
356 |
// println "fix encoding for model "+modelfile |
|
357 |
// new EncodingConverter(f, "UTF-8", "ISO-8859-1") |
|
358 |
// } |
|
359 |
|
|
360 |
cpb.tick() |
|
361 |
File infile = f; |
|
362 |
File outfile = new File(f.getParent(),f.getName()+"-out.tt"); |
|
363 |
if (!applyTT(infile, outfile, modelfile)) { |
|
364 |
return false; |
|
365 |
} |
|
366 |
|
|
367 |
// //Reconvert encoding if needed |
|
368 |
// if (modelfile.getName().equals("sp.par")) { |
|
369 |
// if(debug) |
|
370 |
// println "convert "+f+" latin1 >> UTF-8" |
|
371 |
// new EncodingConverter(f, "ISO-8859-1", "UTF-8") |
|
372 |
// } |
|
373 |
} |
|
374 |
println("") |
|
375 |
|
|
376 |
if (cancelNow) return; |
|
377 |
|
|
378 |
//BUILD STAND-OFF FILES |
|
379 |
if (modelfile == null) { |
|
380 |
println "no model applied" |
|
381 |
return false; |
|
382 |
} |
|
383 |
|
|
384 |
initTTOutfileInfos(binDir, modelfile, modelfilename); |
|
385 |
files = new File(binDir, "treetagger").listFiles()// now contains the result files of TT |
|
386 |
println "Building stdoff files ("+files.size()+") from dir:"+new File(binDir, "treetagger")+" to "+new File(binDir, "annotations"); |
|
387 |
if (files == null || files.size() == 0) |
|
388 |
return false; |
|
389 |
cpb = new ConsoleProgressBar(files.size()) |
|
390 |
for (File ttrezfile : files) { |
|
391 |
cpb.tick() |
|
392 |
File annotfile = new File(binDir, "annotations/"+ttrezfile.getName()+"-STOFF.xml"); |
|
393 |
if (!writeStandoffFile(ttrezfile, annotfile)) |
|
394 |
println("Failed to build standoff file of "+ttrezfile); |
|
395 |
} |
|
396 |
println("") |
|
397 |
|
|
398 |
if (cancelNow) return; |
|
399 |
|
|
400 |
//INJECT ANNOTATIONS |
|
401 |
List<File> interpfiles = new File(binDir, "annotations").listFiles(); |
|
402 |
List<File> txmfiles = txmDir.listFiles(); |
|
403 |
if (txmfiles == null) { |
|
404 |
println "No file to annotate in "+txmDir.getAbsolutePath() |
|
405 |
return false; |
|
406 |
} |
|
407 |
interpfiles.sort(); // same order |
|
408 |
txmfiles.sort(); //same order |
|
409 |
println "Injecting stdoff files ("+interpfiles.size()+") data from "+new File(binDir, "annotations")+ " to xml-txm files of "+txmDir; |
|
410 |
if (interpfiles == null || interpfiles.size() == 0) |
|
411 |
return false; |
|
412 |
cpb = new ConsoleProgressBar(interpfiles.size()) |
|
413 |
for (int i = 0 ; i < interpfiles.size() ; i++) { |
|
414 |
cpb.tick() |
|
415 |
File srcfile = txmfiles.get(i); |
|
416 |
File pos1file = interpfiles.get(i); |
|
417 |
File temp = File.createTempFile("Annotate", "temp", srcfile.getParentFile()); |
|
418 |
def builder = new AnnotationInjection(srcfile.toURI().toURL(), pos1file.toURI().toURL()); |
|
419 |
if (!builder.process(temp)) { |
|
420 |
return false; |
|
421 |
} |
|
422 |
builder = null; |
|
423 |
|
|
424 |
//println "renaming files..." |
|
425 |
if (!(srcfile.delete() && temp.renameTo(srcfile))) |
|
426 |
println "Warning can't rename file "+temp+" to "+srcfile |
|
427 |
} |
|
428 |
println("") |
|
429 |
return true; |
|
430 |
} |
|
431 |
|
|
432 |
/**
 * Sets the directory in which the TreeTagger model files are looked up.
 *
 * @param modelsDirectory the directory to store in this object's modelsDirectory field
 */
public void setModelsDirectory(File modelsDirectory) {
	this.modelsDirectory = modelsDirectory
}
|
435 |
|
|
436 |
/**
 * Runs the annotation file by file, which allows a different language per file.
 * Shortcut for the six-argument run with replace set to false (new word
 * properties are added) and with empty property and option arrays.
 *
 * @param binDir forwarded unchanged to the six-argument run
 * @param txmDir forwarded unchanged to the six-argument run
 * @param langs associates a file name with a model filename
 * @return true, if successful
 */
public boolean run(File binDir, File txmDir, HashMap<String, String> langs)
{
	boolean replace = false
	String[] noProperties = new String[0]
	String[] noOptions = new String[0]
	return run(binDir, txmDir, langs, replace, noProperties, noOptions)
}
|
448 |
|
|
449 |
/**
 * Runs the annotation file by file, which allows a different language per file.
 * The "annotations", "ptreetagger" and "treetagger" directories of binDir are
 * emptied and re-created, then one task per file of txmDir is submitted to a
 * fixed-size thread pool. The method waits up to 10 hours for the pool to finish.
 *
 * @param binDir directory in which the working directories are created
 * @param txmDir directory whose files are annotated
 * @param langs associates a file name with a model filename
 * @param replace replace or create a word property
 * @param properties forwarded to the per-file run
 * @param options forwarded to the per-file run
 * @return false if txmDir cannot be listed; true otherwise
 */
public boolean run(File binDir, File txmDir, HashMap<String, String> langs, boolean replace, String[] properties, String[] options)
{
	String treeTaggerBinPath = TreeTaggerPreferences.getString(TreeTaggerPreferences.INSTALL_PATH, TreeTaggerPreferences.PREFERENCES_NODE)+"/bin/"
	if (!new File(treeTaggerBinPath).exists()) {
		println("Path to TreeTagger is wrong "+treeTaggerBinPath)
		// NOTE(review): a wrong install path is reported as success; kept as-is
		// because callers may rely on it - confirm this is intended.
		return true;
	}

	// listFiles() returns null when txmDir is not a readable directory
	def files = txmDir.listFiles()
	if (files == null) {
		println "No file to annotate in "+txmDir.getAbsolutePath()
		return false;
	}

	//cleaning
	File annotDir = new File(binDir,"annotations");
	annotDir.deleteDir();
	annotDir.mkdir();
	File ptreetaggerDir = new File(binDir,"ptreetagger");
	ptreetaggerDir.deleteDir();
	ptreetaggerDir.mkdir();
	File treetaggerDir = new File(binDir,"treetagger");
	treetaggerDir.deleteDir();
	treetaggerDir.mkdir();

	int cores = Runtime.getRuntime().availableProcessors()
	int coresToUse = Math.max(1.0, cores * 0.7)
	ExecutorService pool = Executors.newFixedThreadPool(coresToUse)

	ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
	// declared outside the loop so that each worker gets a distinct name (TT_1, TT_2, ...)
	int counter = 1;
	for (File teiFile : files) {
		ThreadFile t = new ThreadFile("TT_"+counter++, teiFile) {

					public void run() {

						if (cancelNow) return;
						if (langs.get(f.getName()) == null) {
							println "Error: no lang defined for file $f"
							return;
						}

						String lang = langs.get(f.getName());
						run(f, lang, binDir, txmDir, replace, properties, options, annotDir, ptreetaggerDir, treetaggerDir)

						cpb.tick();
					}
				};

		pool.execute(t)
	}

	pool.shutdown()
	pool.awaitTermination(10, TimeUnit.HOURS)
	println ""
	return true;
}
|
511 |
/**
 * Annotates a single file with default settings: replace is false and the
 * property and option arrays are empty. The "annotations", "ptreetagger" and
 * "treetagger" directories are created under binDir before delegating to the
 * ten-argument run.
 *
 * @param f the file to annotate
 * @param lang the language; the model file used is lang+".par"
 * @param binDir directory in which the working directories are created
 * @param txmDir forwarded unchanged to the ten-argument run
 * @return the result of the ten-argument run
 */
public boolean run(File f, String lang, File binDir, File txmDir) {

	// created in the same order as the arguments expected by the delegate
	def workDirs = ["annotations", "ptreetagger", "treetagger"].collect { String dirName ->
		File dir = new File(binDir, dirName)
		dir.mkdir()
		return dir
	}

	return run(f, lang, binDir, txmDir, false, new String[0], new String[0], workDirs[0], workDirs[1], workDirs[2])
}
|
522 |
|
|
523 |
|
|
524 |
/**
 * Annotates one file: builds the TreeTagger source file, applies TreeTagger,
 * writes the stand-off file and injects the annotations back into f.
 * f is only replaced when the injection succeeded.
 *
 * @param f the file to annotate; overwritten with the annotated version
 * @param lang the language; the model file used is lang+".par" in the models directory
 * @param binDir forwarded to initTTOutfileInfos
 * @param txmDir directory in which the temporary result file is written
 * @param replace forwarded to AnnotationInjection
 * @param properties forwarded to initTTOutfileInfos
 * @param options forwarded to applyTT
 * @param annotDir directory receiving the "-STDOFF.xml" stand-off file
 * @param ptreetaggerDir directory receiving the "-src.tt" TreeTagger input file
 * @param treetaggerDir directory receiving the "-out.tt" TreeTagger output file
 * @return false if the model file is missing or a checked step fails, otherwise whether f exists at the end
 */
public boolean run(File f, String lang, File binDir, File txmDir, boolean replace, String[] properties, String[] options, File annotDir, File ptreetaggerDir, File treetaggerDir) {

	File modelfile = new File(modelsDirectory, lang+".par");
	if (!modelfile.exists()) {
		println "Error: No Modelfile available for lang "+modelfile+". Continue import process ";
		return false;
	}
	File annotfile = new File(annotDir, f.getName()+"-STDOFF.xml");
	File ttsrcfile = new File(ptreetaggerDir, f.getName()+"-src.tt");
	File ttrezfile = new File(treetaggerDir, f.getName()+"-out.tt");

	//BUILD TT FILE READY TO BE TAGGED
	// toURI().toURL() escapes characters that are illegal in URLs; File.toURL() does not
	def builder = new BuildTTSrc(f.toURI().toURL());
	if (!builder.process(ttsrcfile, null)) {
		println("Failed to build tt src file of "+f);
		return false;
	}

	//Apply TT
	// NOTE(review): the return type of this four-argument applyTT is not visible here,
	// so its result is left unchecked - confirm and test it if it reports failures.
	applyTT(ttsrcfile, ttrezfile, modelfile, options);

	//CREATE STANDOFF FILES
	initTTOutfileInfos(binDir, modelfile, lang, properties);
	if (!writeStandoffFile(ttrezfile, annotfile)) {
		println("Failed to build standoff file of "+ttrezfile);
		return false;
	}

	//INJECT ANNOTATIONS
	File tmpFile = new File(txmDir, "temp_"+f.getName())
	builder = new AnnotationInjection(f.toURI().toURL(), annotfile.toURI().toURL(), replace);
	if (!builder.process(tmpFile)) {
		// keep the original file untouched and drop the partial result
		tmpFile.delete();
		return false;
	}
	if (!(f.delete() && tmpFile.renameTo(f))) println "Warning can't rename file "+tmpFile+" to "+f

	return f.exists();
}
|
555 |
|
|
556 |
/**
 * Raises the cancelNow flag, which the run methods of this class test
 * before doing further work.
 */
public void setCancelNow() {
	this.cancelNow = true
}
|
559 |
|
|
560 |
/**
 * A named thread that carries the file it is associated with.
 * The class defines no run() of its own; users are expected to override it.
 */
public class ThreadFile extends Thread {

	/** The file associated with this thread. */
	File f;

	/**
	 * @param name the thread name
	 * @param f the file associated with this thread
	 */
	public ThreadFile(String name, File f) {
		this.f = f;
		setName(name);
	}
}
|
567 |
} |
tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/AnnotationInjectionFromTTOutput.groovy (revision 927) | ||
---|---|---|
1 |
// Copyright © 2010-2013 ENS de Lyon. |
|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice |
|
4 |
// Sophia Antipolis, University of Paris 3. |
|
5 |
// |
|
6 |
// The TXM platform is free software: you can redistribute it |
|
7 |
// and/or modify it under the terms of the GNU General Public |
|
8 |
// License as published by the Free Software Foundation, |
|
9 |
// either version 2 of the License, or (at your option) any |
|
10 |
// later version. |
|
11 |
// |
|
12 |
// The TXM platform is distributed in the hope that it will be |
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
|
15 |
// PURPOSE. See the GNU General Public License for more |
|
16 |
// details. |
|
17 |
// |
|
18 |
// You should have received a copy of the GNU General |
|
19 |
// Public License along with the TXM platform. If not, see |
|
20 |
// http://www.gnu.org/licenses. |
|
21 |
// |
|
22 |
// |
|
23 |
// |
|
24 |
// $LastChangedDate: 2016-03-02 14:17:06 +0100 (mer., 02 mars 2016) $ |
|
25 |
// $LastChangedRevision: 3134 $ |
|
26 |
// $LastChangedBy: mdecorde $ |
|
27 |
// |
|
28 |
package org.txm.importer.xmltxm |
|
29 |
import javax.xml.stream.* |
|
30 |
|
|
31 |
import org.txm.importer.StaxIdentityParser |
|
32 |
import org.txm.importer.filters.* |
|
33 |
import org.txm.importer.PersonalNamespaceContext |
|
34 |
|
|
35 |
/** |
|
36 |
* The Class AnnotationInjection. |
|
37 |
* |
|
38 |
* @author mdecorde |
|
39 |
* |
|
40 |
* inject annotation from a stand-off file into a xml-tei-txm file |
|
41 |
*/ |
|
42 |
|
|
43 |
public class AnnotationInjectionFromTTOutput extends StaxIdentityParser { |
|
44 |
|
|
45 |
public static String TXMNS = "http://textometrie.org/1.0" |
|
46 |
|
|
47 |
/** The xml reader factory. */ |
|
48 |
private def factory; |
|
49 |
|
|
50 |
/** The links. */ |
|
51 |
private List<String> links; |
|
52 |
|
|
53 |
/** The linkparsers. key=type*/ |
|
54 |
private HashMap<String, XMLStreamReader> linkparsers; |
|
55 |
|
|
56 |
/** The anaurl. */ |
|
57 |
private def anaurl; |
|
58 |
|
|
59 |
/** The anainput data. */ |
|
60 |
private def anainputData; |
|
61 |
|
|
62 |
/** The anafactory. */ |
|
63 |
private XMLInputFactory anafactory = XMLInputFactory.newInstance(); |
|
64 |
|
|
65 |
/** The anaparser. */ |
|
66 |
private XMLStreamReader anaparser; |
|
67 |
private XMLStreamReader headerparser; |
|
68 |
|
|
69 |
/** The resp stmt id. */ |
|
70 |
String respStmtID = ""; |
|
71 |
|
|
72 |
/** The present taxonomies. */ |
|
73 |
ArrayList<String> presentTaxonomies = new ArrayList(); |
|
74 |
|
|
75 |
/**
 * Instantiates a new annotation injection and builds one parser per
 * link group of the stand-off file.
 * Parsing problems are reported on standard output and not rethrown.
 *
 * @param url the xml-tei-txm file
 * @param anaurl the stand-off file
 */
public AnnotationInjectionFromTTOutput(URL url, URL anaurl) {
	super(url); // init reader and writer
	try {
		this.anaurl = anaurl;
		factory = XMLInputFactory.newInstance();
		this.buildLinkParsers();// build a parser per linkgroup
	} catch (XMLStreamException ex) {
		System.out.println(ex);
	} catch (IOException ex) {
		// name the resource and the cause so the failure can be diagnosed
		System.out.println("IOException while parsing "+anaurl+": "+ex);
	}
}
|
93 |
|
|
94 |
/**
 * Copies part of the stand-off file (anaurl) to the writer: start and end
 * tags, namespace declarations, attributes, text and comments.
 *
 * @param containertag local name of the element that delimits the copy
 * @param captureTheTag if true the container element itself is copied too,
 *        otherwise only what lies between its start and end tags
 */
private void getHeaderInfos(String containertag, boolean captureTheTag)
{
	InputStream headerStream = new BufferedInputStream(anaurl.openStream());
	anainputData = headerStream;
	try {
		headerparser = anafactory.createXMLStreamReader(headerStream);
		boolean start = false;
		String localname;
		for (int event = headerparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = headerparser.next())
		{
			if (event == XMLStreamConstants.START_ELEMENT) {
				localname = headerparser.getLocalName();
				if (captureTheTag && localname == containertag) // start copy with the tag
					start = true;
				if (start) { // copy header
					String prefix = headerparser.getPrefix();
					// getPrefix() may return null when the element has no prefix
					if (prefix != null && prefix.length() > 0)
						writer.writeStartElement(Nscontext.getNamespaceURI(prefix), localname)
					else
						writer.writeStartElement(localname);
					for (int i = 0 ; i < headerparser.getNamespaceCount(); i++)
						writer.writeNamespace(headerparser.getNamespacePrefix(i), headerparser.getNamespaceURI(i));
					for (int i = 0 ; i < headerparser.getAttributeCount(); i++)
						writer.writeAttribute(headerparser.getAttributeLocalName(i), headerparser.getAttributeValue(i));
				}
				if (!captureTheTag && localname == containertag) // start copy after the tag
					start = true;
			} else if (event == XMLStreamConstants.END_ELEMENT) {
				localname = headerparser.getLocalName();
				if (!captureTheTag && localname == containertag)
					break;// stop looping before the container end tag is written

				if (start)
					writer.writeEndElement();

				if (captureTheTag && localname == containertag)
					break;// stop looping once the container end tag is written
			} else if (event == XMLStreamConstants.CHARACTERS) {
				if (start)
					writer.writeCharacters(headerparser.getText());
			} else if (event == XMLStreamConstants.COMMENT) {
				if (start)
					writer.writeComment(headerparser.getText());
			}
		}
		headerparser.close();
	} finally {
		// XMLStreamReader.close() does not close the underlying stream
		headerStream.close();
	}
}
|
140 |
|
|
141 |
/**
 * Reads the whole stand-off file and collects the first attribute value of
 * every linkGrp element, ignoring duplicates. Also stores the first attribute
 * value of each respStmt element in respStmtID (the last one read wins).
 *
 * @return the distinct link group values, in document order
 */
private List<String> findGrpLink()
{
	ArrayList<String> groups = new ArrayList<String>();
	InputStream groupStream = anaurl.openStream();
	anainputData = groupStream;
	try {
		anaparser = anafactory.createXMLStreamReader(groupStream);

		for (int event = anaparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = anaparser.next()) {
			if (event != XMLStreamConstants.START_ELEMENT) continue;

			String localname = anaparser.getLocalName();
			if ("linkGrp".equals(localname)) {
				String targetsvalue = anaparser.getAttributeValue(0)

				if (groups.contains(targetsvalue)) {
					System.err.println("Warning: Multiple group declaration : "+targetsvalue+" has already been added, the first one will be used")
				} else {
					groups.add(targetsvalue); // add the taxonomy type
				}
			} else if ("respStmt".equals(localname)) {
				respStmtID = anaparser.getAttributeValue(0); // one attribute (id) only
			}
		}
		anaparser.close();
	} finally {
		// XMLStreamReader.close() does not close the underlying stream
		groupStream.close();
	}
	return groups;
}
|
170 |
|
|
171 |
/**
 * Builds the link parsers: one stream reader per link group found in the
 * stand-off file, each advanced to the start tag of its own linkGrp element
 * (or to the end of the document if that group is not found).
 * The readers stay open; getNextAnaValue keeps reading from them.
 */
private void buildLinkParsers()
{
	// link groups of the standoff file
	links = findGrpLink();
	linkparsers = new HashMap<String, XMLStreamReader>();

	// first pass: open one reader per group
	for (int i = 0 ; i < links.size() ; i++) {
		anainputData = new BufferedInputStream(anaurl.openStream());
		linkparsers.put(links.get(i), anafactory.createXMLStreamReader(anainputData));
	}

	// second pass: move each reader to its group
	for (int i = 0 ; i < links.size() ; i++) {
		String group = links.get(i);
		anaparser = linkparsers.get(group);
		boolean positioned = false;
		while (!positioned) {
			int event = anaparser.next();
			if (event == XMLStreamConstants.END_DOCUMENT)
				break;
			if (event != XMLStreamConstants.START_ELEMENT)
				continue;
			if (!anaparser.getLocalName().equals("linkGrp"))
				continue;
			// when matched, the next element is a link start tag
			positioned = anaparser.getAttributeValue(0).equals(group);
		}
	}
}
|
201 |
|
|
202 |
/**
 * Gets the value carried by the next link element read from the parser of
 * the given link group. The first attribute of the link is expected to look
 * like "#a #b"; the returned value is the "b" part.
 *
 * @param link the link group whose parser is advanced
 * @param wordId not used by this method
 * @return the next value, or "" when the end of the document is reached
 */
private String getNextAnaValue(String link, String wordId)
{
	anaparser = linkparsers.get(link);
	for (int event = anaparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = anaparser.next()) {
		if (event != XMLStreamConstants.START_ELEMENT) continue;
		if (!"link".equals(anaparser.getLocalName())) continue;

		String targetsvalue = anaparser.getAttributeValue(0)
		def m = (targetsvalue =~ /#(.*) #(.*)/)
		if (m) { // external tag
			String anavalue = m[0][2];
			// decode a remaining "&lt;" entity; the previous replace("<", "<") had no effect
			// NOTE(review): the entity form is presumed - confirm against the stand-off writer
			return anavalue.replace("&lt;", "<");
		} else {
			System.err.println("Error: getNextAna(): link target is not well formed: = "+anaparser.getAttributeValue(0));
		}
	}
	return "";
}
|
231 |
|
|
232 |
/**
 * Writes one txm:ana element per link group for a word. Each element gets a
 * "resp" attribute ("#" + respStmtID), a "type" attribute ("#" + group) and,
 * as text, the next value read from that group's parser.
 *
 * @param wordId the word id, forwarded to getNextAnaValue
 */
private void writeAnaTag(String wordId)
{
	for (int i = 0 ; i < links.size() ; i++) {
		String group = links.get(i);
		writer.writeStartElement(TXMNS, "ana");
		writer.writeAttribute("resp", "#"+respStmtID);
		writer.writeAttribute("type", "#"+group);
		String value = getNextAnaValue(group, wordId);
		writer.writeCharacters(value);
		writer.writeEndElement(); // txm:ana
	}
}
|
249 |
|
|
250 |
String wordId; |
|
251 |
boolean flagSourceDesc = false; |
|
252 |
/**
 * Handles a start tag: records the first attribute value of taxonomy
 * elements in presentTaxonomies, remembers the value of the first attribute
 * whose local name is "id" in wordId, then lets the parent class process the tag.
 */
protected void processStartElement()
{
	if (parser.getLocalName().matches("taxonomy")) {
		// taxonomy type
		presentTaxonomies.add(parser.getAttributeValue(0));
	}

	int i = 0;
	boolean idFound = false;
	while (!idFound && i < parser.getAttributeCount()) {
		if (parser.getAttributeLocalName(i) == "id") {
			wordId = parser.getAttributeValue(i);
			idFound = true;
		}
		i++;
	}
	super.processStartElement();
}
|
267 |
|
|
268 |
boolean applicationWritten = false; |
|
269 |
boolean taxonomiesWritten = false; |
|
270 |
/**
 * Handles an end tag. Before the parent class writes the closing tag:
 * - "w": the txm:ana elements of the current word are written;
 * - "appInfo" / "classDecl": the matching content of the stand-off file is appended;
 * - "encodingDesc": appInfo and classDecl elements are created if none was seen;
 * - "titleStmt": the respStmt of the stand-off file is copied when flagSourceDesc is set.
 */
protected void processEndElement() {
	String closing = parser.getLocalName();

	if (closing == "w") {
		writeAnaTag(wordId);
	} else if (closing == "appInfo") {
		applicationWritten = true;
		getHeaderInfos("appInfo", false);
	} else if (closing == "classDecl") {
		taxonomiesWritten = true;
		getHeaderInfos("classDecl", false);
	} else if (closing == "encodingDesc") {
		if (!applicationWritten) {
			writer.writeStartElement("appInfo");
			getHeaderInfos("appInfo", false);
			writer.writeEndElement(); // appInfo
		}
		if (!taxonomiesWritten) {
			writer.writeStartElement("classDecl");
			getHeaderInfos("classDecl", false);
			writer.writeEndElement(); // classDecl
		}
	} else if (closing == "titleStmt" && flagSourceDesc) {
		// NOTE(review): nothing in this class sets flagSourceDesc to true - confirm it is set elsewhere
		getHeaderInfos("respStmt", true);
		flagSourceDesc = false;
	}

	super.processEndElement();
}
|
310 |
|
|
311 |
/** The declarenamespace. */ |
|
312 |
boolean declarenamespace = false; |
|
313 |
|
|
314 |
/**
 * Writes the TEI default namespace and the "txm" namespace on the writer,
 * once only: later calls do nothing.
 *
 * @return true when the namespaces have just been written, null otherwise
 */
private declareNamespace() {
	if (declarenamespace) {
		return null;
	}
	writer.writeDefaultNamespace("http://www.tei-c.org/ns/1.0");
	writer.writeNamespace("txm", TXMNS);
	return (declarenamespace = true);
}
|
326 |
|
|
327 |
/**
 * Developer entry point: injects two stand-off files in turn into a sample
 * document, using hard-coded paths.
 *
 * @param args the arguments (not used)
 */
public static void main(String[] args) {

	// NOTE(review): "~" is not expanded by java.io.File; this is a directory literally named "~" - confirm
	String rootDir = "~/xml/rgaqcj/";
	new File(rootDir + "/injection/").mkdir();

	// java.io.File has no three-String constructor: parent and sub directory are concatenated
	File srcfile = new File(rootDir + "/anainline/", "roland.xml");
	File pos1file = new File(rootDir + "/pos/", "rolandTT1-w-ana.xml");

	File src2file = new File(rootDir + "/injection/", "roland.xml");
	File pos2file = new File(rootDir + "/pos/", "rolandTT2-w-ana.xml");

	println("process file : " + srcfile + " with : " + pos1file);
	// this class only defines the (URL, URL) constructor
	def builder = new AnnotationInjectionFromTTOutput(srcfile.toURI().toURL(),
			pos1file.toURI().toURL());
	// NOTE(review): transfomFile is not defined in this class; presumably inherited - confirm
	builder.transfomFile(new File(rootDir + "/injection/", "roland.xml"));

	println("process file : " + src2file + " with : " + pos2file);
	builder = new AnnotationInjectionFromTTOutput(src2file.toURI().toURL(), pos2file.toURI().toURL());
	builder.transfomFile(rootDir + "/injection/", "roland-FINAL.xml");

	return;
}
|
360 |
} |
tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/package.html (revision 927) | ||
---|---|---|
1 |
<html> |
|
2 |
<body> |
|
3 |
<p>Manages xml-tei-txm files</p> |
|
4 |
</body> |
|
5 |
</html> |
|
0 | 6 |
tmp/org.txm.treetagger.core/src/org/txm/importer/xmltxm/BuildTTSrc.groovy (revision 927) | ||
---|---|---|
1 |
// Copyright © 2010-2013 ENS de Lyon. |
|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice |
|
4 |
// Sophia Antipolis, University of Paris 3. |
|
5 |
// |
|
6 |
// The TXM platform is free software: you can redistribute it |
|
7 |
// and/or modify it under the terms of the GNU General Public |
|
8 |
// License as published by the Free Software Foundation, |
|
9 |
// either version 2 of the License, or (at your option) any |
|
10 |
// later version. |
|
11 |
// |
|
12 |
// The TXM platform is distributed in the hope that it will be |
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
|
15 |
// PURPOSE. See the GNU General Public License for more |
|
16 |
// details. |
|
17 |
// |
|
18 |
// You should have received a copy of the GNU General |
|
19 |
// Public License along with the TXM platform. If not, see |
|
20 |
// http://www.gnu.org/licenses. |
|
21 |
// |
|
22 |
// |
|
23 |
// |
|
24 |
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $ |
|
25 |
// $LastChangedRevision: 3400 $ |
|
26 |
// $LastChangedBy: mdecorde $ |
|
27 |
// |
|
28 |
package org.txm.importer.xmltxm |
|
29 |
|
|
30 |
import java.text.DateFormat; |
|
31 |
import java.util.Date; |
|
32 |
import java.util.ArrayList; |
|
33 |
import javax.xml.stream.*; |
|
34 |
import java.net.URL; |
|
35 |
|
|
36 |
import org.txm.Toolbox; |
|
37 |
import org.txm.importer.filters.*; |
|
38 |
import org.txm.treetagger.core.preferences.TreeTaggerPreferences |
|
39 |
|
|
40 |
// TODO: Auto-generated Javadoc |
|
41 |
/** |
|
42 |
* The Class BuildTTSrc. |
|
43 |
* |
|
44 |
* @author mdecorde |
|
45 |
* build the TT source for tigerSearch |
|
46 |
*/ |
|
47 |
|
|
48 |
public class BuildTTSrc { |
|
49 |
|
|
50 |
/** The url. */ |
|
51 |
private def url; |
|
52 |
|
|
53 |
/** The input data. */ |
|
54 |
private def inputData; |
|
55 |
|
|
56 |
/** The factory. */ |
|
57 |
private def factory; |
|
58 |
|
|
59 |
/** The parser. */ |
|
60 |
private XMLStreamReader parser; |
|
61 |
|
|
62 |
/** The output. */ |
|
63 |
private BufferedWriter output; |
|
64 |
|
|
65 |
/**
 * Instantiates a new builds the tt src.
 * uses XML-TXM V2
 * Opens the url and creates the stream reader used by process().
 * Failures are reported on standard output and not rethrown.
 *
 * @param url the url of the file to process
 */
public BuildTTSrc(URL url) {
	try {
		this.url = url;
		inputData = url.openStream();
		factory = XMLInputFactory.newInstance();
		parser = factory.createXMLStreamReader(inputData);

	} catch (XMLStreamException ex) {
		System.out.println(ex);
	} catch (IOException ex) {
		// name the resource and the cause so the failure can be diagnosed
		System.out.println("IOException while parsing "+url+": "+ex);
	}
}
|
84 |
|
|
85 |
/**
 * Opens the UTF-8 writer stored in the output field.
 * Any failure is reported on standard output.
 *
 * @param outfile the file to write
 * @return true if the writer was opened, false otherwise
 */
private boolean createOutput(File outfile) {
	try {
		FileOutputStream rawStream = new FileOutputStream(outfile);
		OutputStreamWriter utf8Writer = new OutputStreamWriter(rawStream, "UTF-8");
		output = new BufferedWriter(utf8Writer);
		return true;
	} catch (Exception e) {
		System.out.println(e.getLocalizedMessage());
		return false;
	}
}
|
102 |
|
|
103 |
/**
 * Writes the TreeTagger input file: one line per form element (its trimmed
 * text without line breaks, or "__EMPTY__" when blank) plus "<s>" / "</s>"
 * lines for the s elements.
 *
 * @param outfile the outfile
 * @param formtype currently ignored: the code selecting a form by type is
 *        disabled, so the content of every form element is written
 * @return true, if successful
 */
public boolean process(File outfile, String formtype) {
	if (!createOutput(outfile))
		return false;

	boolean flagform = false; // to catch the content of the form tag
	String form = ""; // the content of the form tag
	StringBuilder buffer = new StringBuilder();
	boolean success = false;
	try {
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
			switch (event) {
				case XMLStreamConstants.START_ELEMENT:
					switch (parser.getLocalName()) {
						case "form":
							flagform = true;
							form = "";
							break;
						case "s": // TreeTagger can use s tags
							buffer.append("<s>\n");
							break;
					}
					break;
				case XMLStreamConstants.END_ELEMENT:
					switch (parser.getLocalName()) {
						case "form":
							flagform = false;
							form = form.trim()
							if (form.length() == 0) buffer.append("__EMPTY__\n");
							// the output is a text file: no need to keep entities
							// (the previous replace("<", "<") had no effect)
							// NOTE(review): the entity form is presumed - confirm
							else buffer.append(form.replace("\n", "").replace("&lt;", "<")+ "\n");
							break;

						case "s":
							buffer.append("</s>\n");
							break;
					}
					break;

				case XMLStreamConstants.CHARACTERS:
					if (flagform) {
						if (parser.getText().length() > 0)
							form += parser.getText();
					}
					break;
			}
		}

		String str = buffer.toString()
		// NOTE(review): typographic apostrophes are replaced when FIX_APOSTROPHES is "false" - confirm the polarity
		if ("false".equals(TreeTaggerPreferences.getString(TreeTaggerPreferences.FIX_APOSTROPHES, TreeTaggerPreferences.PREFERENCES_NODE))) {
			str = str.replace("’", "'").replace("‘", "'");
		}
		output.write(str)
		// closed here so that a failing flush is reported as a failure
		output.close();
		parser.close();
		inputData.close();
		success = true;
	} catch (Exception ex) {
		System.out.println(ex);
	} finally {
		if (!success) {
			// best-effort release of the writer and of the input stream after a failure
			try { output.close() } catch (Exception ignored) { }
			try { inputData?.close() } catch (Exception ignored) { }
		}
	}

	return success;
}
|
188 |
|
|
189 |
/** |
|
190 |
* The main method. |
|
191 |
* |
|
192 |
* @param args the arguments |
|
193 |
*/ |
|
194 |
public static void main(String[] args) { |
|
195 |
|
|
196 |
String rootDir = "~/xml/rgaqcj/"; |
|
197 |
// new File(rootDir+"/identity/").mkdir(); |
|
198 |
|
|
199 |
ArrayList<String> milestones = new ArrayList<String>();// the tags who |
|
200 |
// you want them |
|
201 |
// to stay |
|
202 |
// milestones |
Formats disponibles : Unified diff