root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / corptef / annotate.groovy @ 1488
History | View | Annotate | Download (4.7 kB)
1 |
// Copyright © 2010-2013 ENS de Lyon.
|
---|---|
2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 |
// Lyon 2, University of Franche-Comté, University of Nice
|
4 |
// Sophia Antipolis, University of Paris 3.
|
5 |
//
|
6 |
// The TXM platform is free software: you can redistribute it
|
7 |
// and/or modify it under the terms of the GNU General Public
|
8 |
// License as published by the Free Software Foundation,
|
9 |
// either version 2 of the License, or (at your option) any
|
10 |
// later version.
|
11 |
//
|
12 |
// The TXM platform is distributed in the hope that it will be
|
13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 |
// PURPOSE. See the GNU General Public License for more
|
16 |
// details.
|
17 |
//
|
18 |
// You should have received a copy of the GNU General
|
19 |
// Public License along with the TXM platform. If not, see
|
20 |
// http://www.gnu.org/licenses.
|
21 |
//
|
22 |
//
|
23 |
//
|
24 |
// $LastChangedDate: 2013-11-08 13:38:06 +0100 (ven. 08 nov. 2013) $
|
25 |
// $LastChangedRevision: 2569 $
|
26 |
// $LastChangedBy: mdecorde $
|
27 |
//
|
28 |
package org.txm.scripts.importer.corptef;
|
29 |
|
30 |
import org.txm.importer.cwb.* |
31 |
import org.txm.importer.scripts.xmltxm.*; |
32 |
import org.txm.scripts.importer.*; |
33 |
import org.txm.utils.treetagger.TreeTagger; |
34 |
import org.txm.Toolbox; |
35 |
|
36 |
// TODO: Auto-generated Javadoc
|
37 |
/**
 * Annotation step of the "corptef" import: runs TreeTagger over the files of
 * the <code>txm</code> sub-directory of an import directory and injects the
 * resulting annotations back into each of those files.
 */
class annotate {

	/**
	 * Annotates every regular file found in <code>&lt;dir&gt;/txm/</code>.
	 *
	 * The <code>annotations</code> and <code>treetagger</code> sub-directories
	 * of <code>dir</code> are deleted and recreated first. The TreeTagger model
	 * file and install directory (both read from the Toolbox preferences) are
	 * validated once, before any file is processed; if either is wrong a
	 * message is printed and the method returns without annotating anything.
	 *
	 * @param dir the import directory; it must contain a <code>txm</code> sub-directory
	 */
	public void run(File dir)
	{
		String rootDir = dir.getAbsolutePath()+"/"
		String txmDir = dir.getAbsolutePath()+"/txm/"

		//cleaning
		new File(rootDir,"annotations").deleteDir()
		new File(rootDir,"annotations").mkdir()
		new File(rootDir,"treetagger").deleteDir()
		new File(rootDir,"treetagger").mkdir()
		println txmDir

		// File.listFiles() returns null when the path is not a readable directory
		File[] entries = new File(txmDir).listFiles()
		if (entries == null || entries.length == 0) {
			println "Skipping ANNOTATE: no file to annotate in "+txmDir
			return
		}

		// The model file does not depend on the file being annotated: check it once
		String modelsPath = Toolbox.getPreference(Toolbox.TREETAGGER_MODELS_PATH)
		File modelfile = (modelsPath == null) ? null : new File(modelsPath,"/rgaqcj.par")
		if (modelfile == null || !modelfile.exists()) {
			println "Skipping ANNOTATE: Incorrect modelfile path: "+modelfile
			return
		}

		// Same for the TreeTagger binaries: fail before doing any per-file work
		String installPath = Toolbox.getPreference(Toolbox.TREETAGGER_INSTALL_PATH)
		if (installPath == null || installPath.length() == 0 || !new File(installPath+"/bin/").exists()) {
			println("Path to TreeTagger is wrong: "+installPath+"/bin/")
			return
		}
		String binDir = installPath+"/bin/"

		for (File teifile : entries) {
			// sub-directories cannot be tagged; only regular files are processed
			if (!teifile.isFile()) continue

			println("annotate "+teifile)
			annotateFile(teifile, rootDir, modelfile, binDir)
		}
	}

	/**
	 * Tags one file and merges the annotations into it: builds the TreeTagger
	 * input, runs TreeTagger, converts its output to a stand-off annotation
	 * file, injects that file into <code>teifile</code> and replaces
	 * <code>teifile</code> with the result.
	 *
	 * @param teifile the file to annotate; it is replaced by its annotated version
	 * @param rootDir absolute path of the import directory, ending with "/"
	 * @param modelfile the TreeTagger model (parameter) file
	 * @param binDir the directory given to the TreeTagger wrapper
	 */
	private void annotateFile(File teifile, String rootDir, File modelfile, String binDir)
	{
		String name = teifile.getName()
		File annotfile = new File(rootDir+"annotations",name+"-STDOFF.xml")
		File ttsrcfile = new File(rootDir+"treetagger",name+"-src.tt")
		File ttrezfile = new File(rootDir+"treetagger",name+"-out.tt")

		//prepare file to be tagged
		def srcBuilder = new BuildTTSrc(teifile.toURL())
		srcBuilder.process(ttsrcfile)

		//Apply TT
		// NOTE(review): these setters presumably map to TreeTagger command-line options - confirm in TreeTagger wrapper
		TreeTagger tt = new TreeTagger(binDir)
		tt.settoken()
		tt.setlemma()
		tt.setquiet()
		tt.setnounknown()
		tt.setsgml()
		tt.seteostag("<s>")
		tt.treetagger(modelfile.getAbsolutePath(), ttsrcfile.getAbsolutePath(), ttrezfile.getAbsolutePath())

		//create stand-off annotation file
		//targeted file for annotations
		String target = name

		//contains txm:application/txm:commandLine
		File reportFile = new File(rootDir,"NLPToolsParameters.xml")

		String respPerson = System.getProperty("user.name")
		String respId = "txm"
		String respDesc = "NLP annotation tool"
		// NOTE(review): fixed date string, not the date of the run - confirm whether this is intended
		String respDate = "Tue Mar 11 1:02:55 Paris, Madrid 2010"
		String respWhen = ""

		String appIdent = "TreeTagger"
		String appVersion = "3.2"

		String distributor = ""
		String publiStmt = ""
		String sourceStmt = ""

		def types = ["ttpos","ttlemma"]
		def typesDesc = ["fr pos","fr lemma"]
		def typesTAGSET = ["",""]
		def typesWEB = ["",""]
		String idform = "w_"
		String encoding = "UTF-8"

		def transfo = new CSV2W_ANA()
		transfo.setAnnotationTypes(types, typesDesc, typesTAGSET, typesWEB, idform)
		transfo.setResp(respId, respDesc, respDate, respPerson, respWhen)
		transfo.setApp(appIdent, appVersion)
		transfo.setTarget(target, reportFile)
		transfo.setInfos(distributor, publiStmt, sourceStmt)
		transfo.process(ttrezfile, annotfile, encoding)

		//merge into the tei file
		def injector = new AnnotationInjection(teifile.toURL(), annotfile.toURL(), new ArrayList<String>())
		injector.transfomFile(rootDir,"temp")

		// the original is deleted first, then the "temp" file is renamed to take its place
		File tempFile = new File(rootDir,"temp")
		if (!(teifile.delete() && tempFile.renameTo(teifile))) println "Warning can't rename file "+tempFile+" to "+teifile
	}

	/**
	 * The main method.
	 *
	 * @param args optional; args[0] is the import directory to annotate. When
	 * no argument is given the historical hard-coded directory is used.
	 */
	public static void main(String[] args)
	{
		File dir = (args != null && args.length > 0) ? new File(args[0]) : new File("C:/Documents and Settings/alavrent/TXM/corpora/corptef/")
		new annotate().run(dir)
	}
}