Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / perrault / allimport.groovy @ 966

History | View | Annotate | Download (16.7 kB)

1 321 mdecorde
/**
2 321 mdecorde
 * Main.
3 321 mdecorde
 *
4 321 mdecorde
 * @param args the args
5 321 mdecorde
 */
6 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
7 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
8 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
9 321 mdecorde
// Sophia Antipolis, University of Paris 3.
10 714 mdecorde
//
11 321 mdecorde
// The TXM platform is free software: you can redistribute it
12 321 mdecorde
// and/or modify it under the terms of the GNU General Public
13 321 mdecorde
// License as published by the Free Software Foundation,
14 321 mdecorde
// either version 2 of the License, or (at your option) any
15 321 mdecorde
// later version.
16 714 mdecorde
//
17 321 mdecorde
// The TXM platform is distributed in the hope that it will be
18 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
19 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
20 321 mdecorde
// PURPOSE. See the GNU General Public License for more
21 321 mdecorde
// details.
22 714 mdecorde
//
23 321 mdecorde
// You should have received a copy of the GNU General
24 321 mdecorde
// Public License along with the TXM platform. If not, see
25 321 mdecorde
// http://www.gnu.org/licenses.
26 714 mdecorde
//
27 714 mdecorde
//
28 714 mdecorde
//
29 479 mdecorde
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
30 321 mdecorde
// $LastChangedRevision: 3400 $
31 714 mdecorde
// $LastChangedBy: mdecorde $
32 321 mdecorde
//
33 321 mdecorde
package org.txm.importer.perrault;
34 321 mdecorde
35 321 mdecorde
import javax.xml.stream.*
36 321 mdecorde
37 321 mdecorde
import org.txm.importer.*
38 321 mdecorde
import org.txm.importer.cwb.CwbEncode
39 321 mdecorde
import org.txm.importer.cwb.CwbMakeAll
40 321 mdecorde
import org.txm.importer.filters.*
41 321 mdecorde
import org.txm.scripts.*
42 927 mdecorde
import org.txm.importer.xmltxm.*
43 321 mdecorde
import org.txm.utils.treetagger.TreeTagger
44 321 mdecorde
45 321 mdecorde
import filters.CutHeader.*
46 321 mdecorde
import filters.FusionHeader.*
47 321 mdecorde
import filters.Tokeniser.*
48 321 mdecorde
49 321 mdecorde
// TODO: Auto-generated Javadoc
50 321 mdecorde
/**
 * Splits a corpus file by applying the "splitcorpus.xsl" stylesheet found
 * in the same directory as the file.
 *
 * The main transformation result is written to "split_temp.xml" and deleted
 * right away; the files kept are the ones found afterwards in the "split"
 * directory next to the source file (presumably written there by the
 * stylesheet itself -- the stylesheet is not visible from this script).
 *
 * @param file the corpus file to split
 * @return the files found in the "split" directory, or an empty list when
 *         that directory does not exist or cannot be read
 */
public static List<File> split(File file)
{
        println "split file "+file;
        String rootDir = file.getParent()+"/";
        String xslfile = rootDir+"splitcorpus.xsl";
        String outfile = rootDir+"split_temp.xml";

        ApplyXsl a = new ApplyXsl(xslfile);
        a.process(file.getPath(), outfile);

        // only the files produced in "split" are of interest, not the main output
        new File(outfile).delete();

        // listFiles() returns null when the directory is missing or unreadable
        File[] parts = new File(file.getParent(), "split").listFiles();
        if (parts == null)
                return new ArrayList<File>();
        return new ArrayList<File>(Arrays.asList(parts));
}
77 321 mdecorde
78 321 mdecorde
/**
 * Run1: import step. Splits the source corpus, wraps every split part in a
 * TEI document, tokenizes it and converts it to XML-TEI-TXM.
 *
 * Working directories "tokenized", "split" and "txm" are (re)created next to
 * each source file. Note that the working state (root directory, header
 * source and list of split files) is overwritten for each source file, so
 * only the parts of the LAST source file go through the later stages.
 *
 * @param srcfiles the corpus files to import
 */
public void run1(File[] srcfiles)
{
        if (srcfiles == null || srcfiles.length == 0)
        {
                println "no source file to import";
                return;
        }

        List<File> files = null;
        File fullfile = null;
        String rootDir ="";
        // the tags that must stay milestones
        ArrayList<String> milestones = new ArrayList<String>();
        milestones.add("tagUsage");
        milestones.add("pb");
        milestones.add("lb");

        for(File f : srcfiles)
        {
                rootDir = f.getParent()+"/"
                fullfile = f;
                // start from empty working directories BEFORE splitting, so that the
                // list returned by split() only contains freshly generated files
                for(String name : ["tokenized", "split", "txm"])
                {
                        new File(rootDir+name).deleteDir();
                        new File(rootDir+name).mkdir();
                }
                files = split(f);
        }

        //get header: every line of the source file before the one containing <text>
        StringBuilder header = new StringBuilder();
        Reader reader = new FileReader(fullfile);
        try
        {
                String cline = reader.readLine();
                // cline is null at end of file (no <text> line found)
                while(cline != null && !cline.trim().contains("<text>"))
                {
                        header.append(cline).append("\n");
                        cline = reader.readLine();
                }
        }
        finally
        {
                reader.close();
        }

        //put each split part into a complete TEI file (header + text/body wrapper)
        for(File f : files)
        {
                File temp = new File(f.getParent(),"temp");
                Writer writer = new FileWriter(temp)
                try
                {
                        writer.write(header.toString());
                        writer.write("""<text>\n<body>\n""")
                        f.eachLine{String line->
                                // drop the XML declaration of the part, the header already has one
                                if(!line.startsWith("<?xml"))
                                        writer.write(line+"\n");
                        }
                        writer.write("""</body>\n</text>\n</TEI>""")
                }
                finally
                {
                        writer.close();
                }

                if (!(f.delete() && temp.renameTo(f))) println "Warning can't rename file "+temp+" to "+f
        }

        //PREPARE EACH SPLIT FILE TO BE TOKENIZED
        println files
        for(File f : files)
        {
                File resultfile = new File(rootDir+"tokenized",f.getName()+"-src.xml");
                println("prepare tokenizer file : "+f+" to : "+resultfile );
                def builder = new OneTagPerLine(f.toURL(), milestones);
                builder.process(resultfile);
        }

        //TOKENIZE FILES
        for(File f : files)
        {
                Sequence S = new Sequence();
                Filter F1 = new CutHeader();
                Filter F6 = new Tokeniser(f);
                Filter F11 = new FusionHeader();
                S.add(F1);
                S.add(F6);
                S.add(F11);
                File infile = new File(rootDir+"tokenized",f.getName()+"-src.xml");
                File xmlfile = new File(rootDir+"tokenized",f.getName()+"-out.xml");
                File headerfile = new File(rootDir+"tokenized",f.getName()+"header.xml");
                println("Tokenize "+xmlfile)
                S.SetInFileAndOutFile(infile.getPath(), xmlfile.getPath());
                S.setEncodages("UTF-8","UTF-8");
                Object[] arguments1 = [headerfile.getAbsolutePath()];
                F1.SetUsedParam(arguments1);
                Object[] arguments2 = [headerfile.getAbsolutePath(),F1];
                F11.SetUsedParam(arguments2);
                S.proceed();
                S.clean();
                infile.delete();//remove the prepared file to clean
                headerfile.delete();//remove the prepared file to clean
        }
        files = new File(rootDir,"tokenized").listFiles()

        //TRANSFORM INTO XML-TEI-TXM
        for(File f : files)
        {
                println("build xml-tei-txm "+f+ " >> "+f.getName()+"-TXM.xml")
                String txmfile = f.getName()+"-TXM.xml";

                def correspType = new HashMap<String,String>()
                def correspRef = new HashMap<String,String>()
                //the ids of all the respStmt must be listed
                def respId = [];
                //maps a respId to the execution report of the tool
                def applications = new HashMap<String,HashMap<String,String>>();
                //maps a respId to the type attributes of the ana property of the txm w,
                //used to build the references to the taxonomies
                def taxonomiesUtilisees = new HashMap<String,String[]>();
                //maps an item id to its description and its URI
                def itemsURI = new HashMap<String,HashMap<String,String>>();
                //respStmt information
                //resps (respId <see above>, [description, person, date])
                def resps = new HashMap<String,String[]>();
                //run the conversion
                def builder = new Xml2Ana(f);
                builder.setCorrespondances(correspRef, correspType);
                builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
                builder.transformFile(rootDir+"txm/",txmfile);
        }

        //rename files correctly: keep what precedes the first "." and add ".xml"
        files = new File(rootDir,"txm").listFiles();
        for(File file : files)
        {
                String txmfile = file.getName();
                txmfile = txmfile.tokenize(".").get(0)+".xml"
                File renamed = new File(file.getParent(),txmfile);
                if (!file.renameTo(renamed)) println "Warning can't rename file "+file+" to "+renamed
        }
}
223 321 mdecorde
224 321 mdecorde
/**
 * Run2: annotation step. Tags every file of the "txm" directory with
 * TreeTagger and injects the resulting annotations back into the file.
 *
 * Stages, all relative to rootDirFile:
 *  - "txm/X"         -> "treetagger/X.tt"                         (BuildTTSrc)
 *  - "X.tt"          -> "treetagger/X.tt-out.tt"                  (TreeTagger, model "models/fr.par")
 *  - "X.tt-out.tt"   -> "annotations/X.tt-out.tt-STOFF.xml"       (CSV2W_ANA)
 *  - annotations are injected into "txm/X"                        (AnnotationInjection)
 *
 * Does nothing but print a message when the "models" or "txm" directory is missing.
 *
 * @param rootDirFile the working directory of the corpus
 */
public void run2(File rootDirFile)
{
        String rootDir = rootDirFile.getAbsolutePath()+"/";
        if(!new File(rootDir,"models").exists() || !new File(rootDir,"txm").exists())
        {
                println "no models dir or no txm dir, check rootDir "+rootDirFile;
                return;
        }
        //cleaning
        new File(rootDir,"annotations").deleteDir();
        new File(rootDir,"annotations").mkdir();
        new File(rootDir,"treetagger").deleteDir();
        new File(rootDir,"treetagger").mkdir();

        // the tags that must stay milestones
        ArrayList<String> milestones = new ArrayList<String>();
        milestones.add("tagUsage");
        milestones.add("pb");
        milestones.add("lb");

        List<File> files = new File(rootDir,"txm").listFiles()
        //BUILD TT FILE READY TO BE TAGGED
        for(File f : files)
        {
                File resultfile = new File(rootDir+"treetagger/",f.getName()+".tt");
                new BuildTTSrc(f.toURL()).process(resultfile)
        }

        //APPLY TREETAGGER
        files = new File(rootDir,"treetagger").listFiles()
        for(File f : files)
        {
                File modelfile = new File(rootDir+"models/","fr.par");
                File infile = f
                File outfile = new File(f.getParent(),f.getName()+"-out.tt");
                println("3- APPLY TT on : "+infile+" with : "+modelfile +" >>  "+outfile);

                TreeTagger tt = new TreeTagger(System.getProperty("user.home")+"/TXM/treetagger/bin/");
                tt.settoken();
                tt.setlemma();
                tt.setquiet();
                tt.setsgml();
                tt.setnounknown();
                tt.seteostag("<s>");
                tt.treetagger( modelfile.getAbsolutePath(), infile.getAbsolutePath(), outfile.getAbsolutePath())
                // the TreeTagger input is no longer needed once tagged
                infile.delete();
        }

        //BUILD STAND-OFF FILES
        //contains txm:application/txm:commandLine
        File reportFile = new File(rootDir,"NLPToolsParameters.xml");

        String respPerson = System.getProperty("user.name");
        String respId = "txm";
        String respDesc = "NLP annotation tool";
        String respDate = "";
        String respWhen = ""

        String appIdent = "TreeTagger";
        String appVersion = "3.2";

        String distributor = "";
        String publiStmt = """""";
        String sourceStmt = """""";

        def types = ["pos","lemme"];
        def typesDesc = ["",""];
        def typesTAGSET = ["",""];
        def typesWEB = ["",""];
        String idform ="w_c_";

        files = new File(rootDir,"treetagger").listFiles()
        for(File f : files)
        {
                String target = f.getAbsolutePath();
                File posfile = new File(rootDir+"annotations/",f.getName()+"-STOFF.xml");

                def encoding ="UTF-8";
                def transfo = new CSV2W_ANA();
                println("build w-interp "+f.getName()+ ">>"+posfile.getName())
                transfo.setAnnotationTypes( types, typesDesc, typesTAGSET, typesWEB, idform);
                transfo.setResp(respId, respDesc,respDate, respPerson, respWhen);
                transfo.setApp(appIdent, appVersion);
                transfo.setTarget(target, reportFile);
                transfo.setInfos(distributor,  publiStmt, sourceStmt);
                transfo.process( f, posfile, encoding );
        }

        //INJECT ANNOTATIONS
        // Each TXM file is paired with its stand-off file by NAME (see the naming
        // chain in the method documentation) rather than by position in two
        // directory listings: listFiles() guarantees no particular order, and the
        // two directories may not hold the same number of files.
        List<File> txmfiles = new File(rootDir,"txm").listFiles();
        for(File srcfile : txmfiles)
        {
                File pos1file = new File(rootDir+"annotations/",srcfile.getName()+".tt-out.tt-STOFF.xml");
                if (!pos1file.exists())
                {
                        println "Warning no annotation file "+pos1file+" for "+srcfile
                        continue;
                }
                File temp = new File(rootDir,"temp");

                println("5- inject annotation in file : "+srcfile+" with : "+pos1file );

                def builder = new org.txm.scripts.teitxm.AnnotationInjection(srcfile.toURL(), pos1file.toURL(), milestones);
                builder.transfomFile(temp.getParent(),temp.getName());

                // replace the source file by the annotated one
                if (!(srcfile.delete() && temp.renameTo(srcfile))) println "Warning can't rename file "+temp+" to "+srcfile
        }
}
336 321 mdecorde
337 321 mdecorde
// State shared by compil(), createOutput(), fillanaHash(), GoToText() and
// transfomFileCqp().
// A plain declaration at the top level of a Groovy script is a local variable
// of the script body and cannot be seen from the script's methods; @Field
// turns each declaration into a field of the script class so that the methods
// below really read and write these variables.
@groovy.transform.Field def inputData;            // input stream opened on the current url
@groovy.transform.Field def factory;              // XMLInputFactory used to create the parser
@groovy.transform.Field XMLStreamReader parser;   // StAX parser reading inputData
@groovy.transform.Field def dir;
@groovy.transform.Field def output;               // writer opened by createOutput()
@groovy.transform.Field def url;                  // URL of the file being processed
@groovy.transform.Field HashMap<String,String> anahash = new HashMap<String,String>();
@groovy.transform.Field String text = "";         // value of the "id" attribute written on <text>
@groovy.transform.Field String base = "";         // value of the "base" attribute written on <text>
@groovy.transform.Field String project = "";      // value of the "project" attribute written on <text>
347 321 mdecorde
348 321 mdecorde
/**
 * Initializes the processing of one file: stores the identifiers written on
 * the <text> element and opens a StAX parser on the given URL.
 *
 * Failures are reported on standard output and not rethrown; in that case
 * the parser is not (re)created.
 *
 * @param url the URL of the XML file to read
 * @param text the text identifier
 * @param base the base (corpus) name
 * @param project the project name
 */
public void compil(URL url,String text,String base, String project)
{
        this.text = text
        this.base = base;
        this.project = project;
        this.url = url;

        def stream = null;
        try {
                stream = url.openStream();
                inputData = stream;

                factory = XMLInputFactory.newInstance();
                parser = factory.createXMLStreamReader(inputData);
        } catch (XMLStreamException ex) {
                System.out.println(ex);
                // the stream was opened but no parser will ever read it: release it
                try {
                        stream.close();
                } catch (IOException ignored) {
                        // nothing more can be done for a stream that fails to close
                }
        } catch (IOException ex) {
                // report which file failed and why
                System.out.println("IOException while parsing "+url+": "+ex);
        }
}
373 321 mdecorde
374 321 mdecorde
/**
 * Opens the writer stored in {@code output} on the given file.
 *
 * When the file already exists the writer appends to it, otherwise the
 * file is created. Any failure is printed on standard output.
 *
 * @param dirPathName the directory of the output file
 * @param fileName the name of the output file
 * @return true if the writer could be opened, false otherwise
 */
private boolean createOutput(String dirPathName, String fileName){
        boolean opened = false;
        try {
                File target = new File(dirPathName, fileName)
                // append only when there is already something to append to
                boolean append = target.exists()
                output = new java.io.FileWriter(target, append)
                opened = true;
        } catch (Exception e) {
                System.out.println(e.getLocalizedMessage());
        }
        return opened;
}
391 321 mdecorde
392 321 mdecorde
/**
 * Resets {@code anahash}: empties it, then maps every entry of {@code types}
 * to the placeholder value "-".
 *
 * NOTE(review): {@code types} is not declared in the visible part of this
 * script, and no call to this method is visible either -- confirm where
 * {@code types} is supposed to come from before using this method.
 */
private void fillanaHash()
{
        anahash.clear();
        for (String type in types) {
                anahash.put(type, "-");
        }
}
401 321 mdecorde
402 321 mdecorde
403 321 mdecorde
/**
 * Advances {@code parser} until the end tag of the "teiHeader" element has
 * been read, or until the end of the document when there is no such element.
 */
private void GoToText()
{
        int event = parser.next();
        while (event != XMLStreamConstants.END_DOCUMENT)
        {
                boolean headerClosed = event == XMLStreamConstants.END_ELEMENT && parser.getLocalName().equals("teiHeader");
                if (headerClosed)
                        return;
                event = parser.next();
        }
}
415 321 mdecorde
416 321 mdecorde
/**
 * Converts the document currently opened by compil() into CQP input and
 * writes (appends) it to the given file.
 *
 * The TEI header is skipped, then:
 *  - "text" becomes <text id base project>,
 *  - "p", "q" and "l" are copied with a generated id, "s" is copied as is,
 *  - "lg" becomes a <moral id head> element, opened when its "head" ends,
 *    the head attribute being the text read inside that head,
 *  - each "w" becomes one line: form, the "ana" values and the word id,
 *    separated by tabulations; words of a head located in a "lg" are not written.
 *
 * The writer, the parser and the input stream are always closed when the
 * method returns.
 *
 * @param dirPathName the directory of the output file
 * @param fileName the name of the output file
 * @return true if the file was entirely converted, false if the output could
 *         not be opened or if a parsing/writing error was reported
 */
public boolean transfomFileCqp(String dirPathName, String fileName)
{
        // without an output writer nothing can be written
        if (!createOutput(dirPathName, fileName))
                return false;

        String vAna = "";
        String vForm = "";
        String wordid= "";
        String vHead = "";

        int p_id = 0;
        int q_id = 0;
        int lg_id = 0;
        int l_id = 0;

        boolean flaglg = false;   // inside a "lg" element
        boolean flaghead = false; // inside a "head" element
        boolean flagForm = false; // inside a "form" element
        boolean flagAna = false;  // inside an "ana" element

        boolean success = true;
        try
        {
                this.GoToText()

                try
                {
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                        {
                                switch (event)
                                {
                                        case XMLStreamConstants.START_ELEMENT:
                                                switch (parser.getLocalName())
                                                {
                                                        case "head":
                                                        flaghead =true;
                                                        vHead="";
                                                        break;

                                                        case "text":
                                                        output.write("<text id=\""+text+"\" base=\""+base+"\" project=\""+project+"\">\n");
                                                        break;

                                                        case "p":
                                                        output.write("<"+parser.getLocalName()+" id=\""+text+"_p_"+(p_id++)+"\">\n");
                                                        break;
                                                        case "q":
                                                        output.write("<"+parser.getLocalName()+" id=\""+text+"_q_"+(q_id++)+"\">\n");
                                                        break;
                                                        case "l":
                                                        output.write("<"+parser.getLocalName()+" id=\""+text+"_l_"+(l_id++)+"\">\n");
                                                        break;

                                                        case "lg":
                                                        // the <moral> tag is written when the head of the lg ends
                                                        flaglg = true;
                                                        break;

                                                        case "s":
                                                        output.write( "<s>\n");
                                                        break;

                                                        case "w":
                                                        for(int i = 0 ; i < parser.getAttributeCount(); i++)
                                                        {
                                                                if(parser.getAttributeLocalName(i).equals("id"))
                                                                        wordid = parser.getAttributeValue(i);
                                                        }
                                                        break;

                                                        case "form":
                                                        flagForm = true;
                                                        vForm = "";
                                                        vAna ="";
                                                        break;

                                                        case "ana":
                                                        flagAna = true;
                                                        break;
                                                }
                                                break;

                                        case XMLStreamConstants.END_ELEMENT:
                                                switch (parser.getLocalName())
                                                {
                                                        case "head":
                                                        flaghead =false;
                                                        if(flaglg)
                                                                output.write("<moral id=\""+text+"_moral_"+(lg_id++)+"\" head=\""+vHead+"\">\n");
                                                        break;

                                                        case "text":
                                                        output.write("</text>\n");
                                                        break;

                                                        case "p":
                                                        case "q":
                                                        case "l":
                                                        output.write("</"+parser.getLocalName()+">\n");
                                                        break;

                                                        case "lg":
                                                        output.write("</moral>\n");
                                                        flaglg = false;
                                                        break;

                                                        case "s":
                                                        output.write( "</s>\n");
                                                        break;

                                                        case "w":
                                                        // the words of the head of a lg only feed the head attribute
                                                        if(!(flaghead && flaglg))
                                                                output.write( vForm +vAna+"\t"+wordid+"\n");
                                                        vAna = "";
                                                        vForm = "";
                                                        break;

                                                        case "form":
                                                        flagForm = false;
                                                        break;

                                                        case "ana":
                                                        flagAna = false;
                                                        break;
                                                }
                                                break;

                                        case XMLStreamConstants.CHARACTERS:
                                                if(flagForm)
                                                        vForm += parser.getText().trim();
                                                if(flagAna)
                                                        vAna += "\t" +parser.getText().trim();
                                                if(flaghead && flaglg)
                                                        vHead += parser.getText().trim();
                                                break;
                                }
                        }
                }
                catch (XMLStreamException ex) {
                        System.out.println(ex);
                        success = false;
                }
                catch (IOException ex) {
                        System.out.println("IOException while parsing " + inputData);
                        success = false;
                }
        }
        finally
        {
                // release every resource, whatever happened above
                try {
                        if (output != null) output.close();
                } catch (IOException ex) {
                        System.out.println(ex);
                        success = false;
                }
                try {
                        if (parser != null) parser.close();
                } catch (XMLStreamException ex) {
                        System.out.println(ex);
                }
                // closing the StAX reader does not close the underlying stream
                try {
                        if (inputData != null) inputData.close();
                } catch (IOException ex) {
                        System.out.println(ex);
                }
        }

        return success;
}
571 321 mdecorde
572 321 mdecorde
/**
 * Run3: compilation step. Converts the given files into a single CQP file
 * ("cqp/perrault.cqp"), then runs CwbEncode and CwbMakeAll on it to build
 * the "PERRAULT" corpus.
 *
 * The working directory is the grand-parent directory of the first file;
 * its "cqp" directory is recreated and its "registry" directory is created.
 *
 * @param files the files to compile
 * @return true if successful; false when the CWB executables are not
 *         available, when the working directory does not exist or when
 *         the CWB tools raise an exception
 */
public boolean run3(List<File> files)
{
        String rootDir = "";
        if (files.size() > 0) {
                File first = files.get(0);
                rootDir = first.getParentFile().getParentFile().getAbsolutePath()+"/";
        }

        boolean cwbReady = CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable();
        if (!cwbReady) {
                println ("Error: CWB executables not well set.")
                return false;
        }
        if (!new File(rootDir).exists()) {
                println ("binary directory does not exists: "+rootDir)
                return false;
        }

        //cleaning&preparing
        File cqpDir = new File(rootDir+"cqp/");
        new File(rootDir+"cqp/","perrault.cqp").delete();
        cqpDir.deleteDir();
        cqpDir.mkdir();
        new File(rootDir+"registry/").mkdir();

        //1- Transform into CQP file, every text is appended to the same file
        int counttext = 0;
        for (File f : files) {
                counttext++;
                if (f.exists()) {
                        println("process file "+f)
                        compil(f.toURL(),"text"+counttext,"perrault","default");
                        transfomFileCqp(rootDir+"cqp","perrault.cqp");
                } else {
                        println("file "+f+ " does not exists")
                }
        }

        //2- Import into CWB
        String[] pAttributes = ["id","pos","lemme"];
        String[] sAttributes = ["text:0+id+base+project","p:0+id","q:0+id","moral:0+head+id","l:0+id"];

        String dataPath = rootDir + "data/"+"PERRAULT";
        String cqpPath = rootDir + "/cqp/"+"perrault.cqp";
        String registryPath = rootDir + "registry/"+"perrault";

        CwbEncode cwbEn = new CwbEncode();
        CwbMakeAll cwbMa = new CwbMakeAll();
        try {
                cwbEn.run(dataPath, cqpPath, registryPath, pAttributes, sAttributes);
                cwbMa.run("PERRAULT", rootDir + "registry");
        } catch (Exception ex) {
                System.out.println(ex);
                return false;
        }

        System.out.println("Done.")

        return true;
}
638 321 mdecorde
639 321 mdecorde
////FIN
640 321 mdecorde
println "IMPORTER"
// java.io.File does not expand "~" (that is done by the shell only), so the
// corpus location is resolved against the home directory of the user
String userHome = System.getProperty("user.home");
File[] files = [new File(userHome, "xml/perrault/perrault.xml")];
run1(files);

println "ANNOTATE"
File rootDir = new File(userHome, "xml/perrault/");
run2(rootDir);

println "COMPIL"
File directory = new File(userHome, "xml/perrault/txm/");
files = directory.listFiles();
ArrayList<File> Lfiles = new ArrayList<File>();
for(File f : files)
        Lfiles.add f
run3(Lfiles);

//move registry file to cwb registry dir
File registryfile = new File(userHome, "xml/perrault/txm/registry/perrault");