Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / perrault / allimport.groovy @ 966

History | View | Annotate | Download (16.7 kB)

1
/**
2
 * Main.
3
 *
4
 * @param args the args
5
 */
6
// Copyright © 2010-2013 ENS de Lyon.
7
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
8
// Lyon 2, University of Franche-Comté, University of Nice
9
// Sophia Antipolis, University of Paris 3.
10
//
11
// The TXM platform is free software: you can redistribute it
12
// and/or modify it under the terms of the GNU General Public
13
// License as published by the Free Software Foundation,
14
// either version 2 of the License, or (at your option) any
15
// later version.
16
//
17
// The TXM platform is distributed in the hope that it will be
18
// useful, but WITHOUT ANY WARRANTY; without even the implied
19
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
20
// PURPOSE. See the GNU General Public License for more
21
// details.
22
//
23
// You should have received a copy of the GNU General
24
// Public License along with the TXM platform. If not, see
25
// http://www.gnu.org/licenses.
26
//
27
//
28
//
29
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
30
// $LastChangedRevision: 3400 $
31
// $LastChangedBy: mdecorde $
32
//
33
package org.txm.importer.perrault;
34

    
35
import javax.xml.stream.*
36

    
37
import org.txm.importer.*
38
import org.txm.importer.cwb.CwbEncode
39
import org.txm.importer.cwb.CwbMakeAll
40
import org.txm.importer.filters.*
41
import org.txm.scripts.*
42
import org.txm.importer.xmltxm.*
43
import org.txm.utils.treetagger.TreeTagger
44

    
45
import filters.CutHeader.*
46
import filters.FusionHeader.*
47
import filters.Tokeniser.*
48

    
49
// TODO: Auto-generated Javadoc
50
/**
 * Splits a corpus file by applying the "splitcorpus.xsl" stylesheet that
 * sits in the same directory as the corpus file.
 *
 * The direct output of the transformation ("split_temp.xml") is deleted;
 * the result of this method is the content of the "split" directory located
 * beside the corpus file (presumably filled by the stylesheet itself --
 * the stylesheet is not visible from this script).
 *
 * @param file the corpus file to split
 * @return the files listed in the "split" directory, an empty list if that
 *         directory does not exist or cannot be read
 */
public static List<File> split(File file)
{
        println "split file "+file;
        String rootDir = file.getParent()+"/";
        String xslfile = rootDir+"splitcorpus.xsl";
        String outfile = rootDir+"split_temp.xml";

        ApplyXsl a = new ApplyXsl(xslfile);
        a.process(file.getPath(),outfile);

        // the main output of the transformation is only a by-product
        new File(outfile).delete();

        // listFiles() returns null when the directory is missing or unreadable
        File[] found = new File(file.getParent(),"split").listFiles();
        if (found == null)
                return new ArrayList<File>();
        return new ArrayList<File>(Arrays.asList(found));
}
77

    
78
/**
 * First import step: splits the source corpus, wraps every split text with
 * the header of the source file, tokenizes it and converts it to
 * XML-TEI-TXM.
 *
 * The working directories "tokenized", "split" and "txm" are recreated next
 * to each source file. The wrapping, tokenizing and conversion stages only
 * run on the split files of the last element of srcfiles.
 *
 * @param srcfiles the source corpus files
 */
public void run1(File[] srcfiles)
{
        List<File> files = null;
        File fullfile = null;
        String rootDir ="";
        ArrayList<String> milestones = new ArrayList<String>();//the tags that must stay milestones
        milestones.add("tagUsage");
        milestones.add("pb");
        milestones.add("lb");

        for(File f : srcfiles)
        {
                rootDir = f.getParent()+"/"
                fullfile = f;
                for(String name : ["tokenized", "split", "txm"])
                {
                        new File(rootDir+name).deleteDir();
                        new File(rootDir+name).mkdir();
                }
                // split only once the working directories are clean, so the
                // returned list describes files that really exist afterwards
                files = split(f);
        }

        if(fullfile == null)
        {
                println "no source file to import";
                return;
        }

        //get header: every line found before the one containing <text>
        StringBuilder headerBuffer = new StringBuilder();
        BufferedReader reader = new BufferedReader(new FileReader(fullfile));
        try
        {
                String cline = reader.readLine();
                // stop at end of file too, a source without <text> must not raise a NullPointerException
                while(cline != null && !cline.trim().contains("<text>"))
                {
                        headerBuffer.append(cline).append("\n");
                        cline = reader.readLine();
                }
        }
        finally
        {
                reader.close();
        }
        String header = headerBuffer.toString();

        //put each split file into a complete tei file
        for(File f : files)
        {
                File temp = new File(f.getParent(),"temp");
                Writer writer = new FileWriter(temp)
                try
                {
                        writer.write(header);
                        writer.write("""<text>\n<body>\n""")
                        f.eachLine{String line->
                                // the XML declaration is already part of the header
                                if(!line.startsWith("<?xml"))
                                        writer.write(line+"\n");
                        }
                        writer.write("""</body>\n</text>\n</TEI>""")
                }
                finally
                {
                        writer.close();
                }

                if (!(f.delete() && temp.renameTo(f))) println "Warning can't rename file "+temp+" to "+f
        }

        //PREPARE EACH SPLIT FILE TO BE TOKENIZED
        println files
        for(File f : files)
        {
                File srcfile = f;
                File resultfile = new File(rootDir+"tokenized",f.getName()+"-src.xml");
                println("prepare tokenizer file : "+srcfile+" to : "+resultfile );
                def builder = new OneTagPerLine(srcfile.toURL(), milestones);
                builder.process(resultfile);
        }

        //TOKENIZE FILES
        for(File f : files)
        {
                Sequence S = new Sequence();
                Filter F1 = new CutHeader();
                Filter F6 = new Tokeniser(f);
                Filter F11 = new FusionHeader();
                S.add(F1);
                S.add(F6);
                S.add(F11);
                File infile = new File(rootDir+"tokenized",f.getName()+"-src.xml");
                File xmlfile = new File(rootDir+"tokenized",f.getName()+"-out.xml");
                File headerfile = new File(rootDir+"tokenized",f.getName()+"header.xml");
                println("Tokenize "+xmlfile)
                S.SetInFileAndOutFile(infile.getPath(), xmlfile.getPath());
                S.setEncodages("UTF-8","UTF-8");
                Object[] arguments1 = [headerfile.getAbsolutePath()];
                F1.SetUsedParam(arguments1);
                Object[] arguments2 = [headerfile.getAbsolutePath(),F1];
                F11.SetUsedParam(arguments2);
                S.proceed();
                S.clean();
                infile.delete();//remove the prepared file to clean
                headerfile.delete();//remove the prepared file to clean
        }
        files = new File(rootDir,"tokenized").listFiles()

        //TRANSFORM INTO XML-TEI-TXM
        for(File f : files)
        {
                println("build xml-tei-txm "+f+ " >> "+f.getName()+"-TXM.xml")
                String txmfile = f.getName()+"-TXM.xml";

                def correspType = new HashMap<String,String>()
                def correspRef = new HashMap<String,String>()
                //the ids of all the respStmt must be listed
                def respId = [];
                //maps a respId to the execution report of the tool
                def applications = new HashMap<String,HashMap<String,String>>();
                //maps a respId to the type attributes of the ana property of the txm w,
                //used to build the references to the taxonomies
                def taxonomiesUtilisees = new HashMap<String,String[]>();
                //maps an item id to its description and its URI
                def itemsURI = new HashMap<String,HashMap<String,String>>();
                //respStmt informations
                //resps (respId <see above>, [description, person, date])
                def resps = new HashMap<String,String[]>();
                //start the processing
                def builder = new Xml2Ana(f);
                builder.setCorrespondances(correspRef, correspType);
                builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
                builder.transformFile(rootDir+"txm/",txmfile);
        }

        //rename files correctly: keep what precedes the first dot and add ".xml"
        files = new File(rootDir,"txm").listFiles();
        for(File file : files)
        {
                String txmfile = file.getName();
                txmfile = txmfile.tokenize(".").get(0)+".xml"
                file.renameTo(new File(file.getParent(),txmfile));
        }
}
223

    
224
/**
 * Second import step: tags the XML-TEI-TXM files with TreeTagger and
 * injects the resulting annotations back into them.
 *
 * Expects a "models" directory (holding "fr.par") and a "txm" directory
 * inside rootDirFile; the "annotations" and "treetagger" directories are
 * recreated. Does nothing but print a message when "models" or "txm" is
 * missing.
 *
 * @param rootDirFile the working directory of the import
 */
public void run2(File rootDirFile)
{
        String rootDir = rootDirFile.getAbsolutePath()+"/";
        if(!new File(rootDir,"models").exists() || !new File(rootDir,"txm").exists())
        {
                println "no models dir or no txm dir, check rootDir "+rootDirFile;
                return;
        }
        //cleaning
        new File(rootDir,"annotations").deleteDir();
        new File(rootDir,"annotations").mkdir();
        new File(rootDir,"treetagger").deleteDir();
        new File(rootDir,"treetagger").mkdir();

        ArrayList<String> milestones = new ArrayList<String>();//the tags that must stay milestones
        milestones.add("tagUsage");
        milestones.add("pb");
        milestones.add("lb");

        List<File> files = new File(rootDir,"txm").listFiles()
        //BUILD TT FILE READY TO BE TAGGED
        for(File f : files)
        {
                File resultfile = new File(rootDir+"treetagger/",f.getName()+".tt");
                new BuildTTSrc(f.toURL()).process(resultfile)
        }

        //APPLY TREETAGGER
        File modelfile = new File(rootDir+"models/","fr.par");
        files = new File(rootDir,"treetagger").listFiles()
        for(File f : files)
        {
                File infile = f
                File outfile = new File(f.getParent(),f.getName()+"-out.tt");
                println("3- APPLY TT on : "+infile+" with : "+modelfile +" >>  "+outfile);

                TreeTagger tt = new TreeTagger(System.getProperty("user.home")+"/TXM/treetagger/bin/");
                tt.settoken();
                tt.setlemma();
                tt.setquiet();
                tt.setsgml();
                tt.setnounknown();
                tt.seteostag("<s>");
                tt.treetagger( modelfile.getAbsolutePath(), infile.getAbsolutePath(), outfile.getAbsolutePath())
                infile.delete();
        }

        //BUILD STAND-OFF FILES
        //contains txm:application/txm:commandLine
        File reportFile = new File(rootDir,"NLPToolsParameters.xml");

        String respPerson = System.getProperty("user.name");
        String respId = "txm";
        String respDesc = "NLP annotation tool";
        String respDate = "";
        String respWhen = ""

        String appIdent = "TreeTagger";
        String appVersion = "3.2";

        String distributor = "";
        String publiStmt = """""";
        String sourceStmt = """""";

        def types = ["pos","lemme"];
        def typesDesc = ["",""];
        def typesTAGSET = ["",""];
        def typesWEB = ["",""];
        String idform ="w_c_";

        files = new File(rootDir,"treetagger").listFiles()
        for(File f : files)
        {
                String target = f.getAbsolutePath();
                File ttfile = f
                File posfile = new File(rootDir+"annotations/",f.getName()+"-STOFF.xml");

                def encoding ="UTF-8";
                def transfo = new CSV2W_ANA();
                println("build w-interp "+ttfile.getName()+ ">>"+posfile.getName())
                transfo.setAnnotationTypes( types, typesDesc, typesTAGSET, typesWEB, idform);
                transfo.setResp(respId, respDesc,respDate, respPerson, respWhen);
                transfo.setApp(appIdent, appVersion);
                transfo.setTarget(target, reportFile);
                transfo.setInfos(distributor,  publiStmt, sourceStmt);
                transfo.process( ttfile, posfile, encoding );
        }

        //INJECT ANNOTATIONS
        // File.listFiles() gives no ordering guarantee, so the stand-off file of
        // a txm file is found by name instead of by position in a listing:
        // X -> X.tt -> X.tt-out.tt -> X.tt-out.tt-STOFF.xml (see the steps above)
        List<File> txmfiles = new File(rootDir,"txm").listFiles();
        for(File srcfile : txmfiles)
        {
                File pos1file = new File(rootDir+"annotations/",srcfile.getName()+".tt-out.tt-STOFF.xml");
                if(!pos1file.exists())
                {
                        println "Warning no annotation file "+pos1file+" for "+srcfile
                        continue;
                }
                File temp = new File(rootDir,"temp");

                println("5- inject annotation in file : "+srcfile+" with : "+pos1file );

                def builder = new org.txm.scripts.teitxm.AnnotationInjection(srcfile.toURL(), pos1file.toURL(), milestones);
                builder.transfomFile(temp.getParent(),temp.getName());

                if (!(srcfile.delete() && temp.renameTo(srcfile))) println "Warning can't rename file "+temp+" to "+srcfile
        }
}
336

    
337
// State shared by compil(), createOutput(), GoToText() and transfomFileCqp().
// A variable declared in a script body is local to that body and cannot be
// seen from the methods of the script; @Field turns each declaration into a
// field of the script class so that the methods really use these variables.

/** stream opened on the document given to compil() */
@groovy.transform.Field def inputData;
/** StAX factory used to create the parser */
@groovy.transform.Field def factory;
/** StAX parser reading inputData */
@groovy.transform.Field XMLStreamReader parser;
@groovy.transform.Field def dir;
/** writer created by createOutput() */
@groovy.transform.Field def output;
/** location of the document given to compil() */
@groovy.transform.Field def url;
/** values of the ana tags, see fillanaHash() */
@groovy.transform.Field HashMap<String,String> anahash =new HashMap<String,String>() ;
/** id written on the text tag and used as prefix of the structure ids */
@groovy.transform.Field String text="";
/** value of the base attribute of the text tag */
@groovy.transform.Field String base="";
/** value of the project attribute of the text tag */
@groovy.transform.Field String project="";
347

    
348
/**
 * Stores the description of the text to process and opens a StAX parser
 * on the given document.
 *
 * Failures are reported on the standard output and are not rethrown.
 *
 * @param url the location of the XML document to read
 * @param text the id of the text
 * @param base the base name written on the text tag
 * @param project the project name written on the text tag
 */
public void compil(URL url,String text,String base, String project)
{
        this.text = text
        this.base = base;
        this.project = project;
        this.url = url;
        try {
                inputData = url.openStream();

                factory = XMLInputFactory.newInstance();
                parser = factory.createXMLStreamReader(inputData);
        } catch (XMLStreamException ex) {
                System.out.println(ex);
        } catch (IOException ex) {
                // name the document and the cause, the message used to stop after "parsing "
                System.out.println("IOException while parsing "+url+": "+ex);
        }
}
373

    
374
/**
 * Opens the output writer on the given file.
 *
 * When the file already exists the writer appends to it, otherwise the file
 * is created. The text is written in UTF-8 whatever the default charset of
 * the platform is.
 *
 * @param dirPathName the path of the directory of the output file
 * @param fileName the name of the output file
 * @return true, if the writer has been opened
 */
private boolean createOutput(String dirPathName, String fileName){
        try {
                File f = new File(dirPathName, fileName)
                // FileWriter would encode with the platform default charset
                output = new OutputStreamWriter(new FileOutputStream(f,f.exists()),"UTF-8")
                return true;
        } catch (Exception e) {
                System.out.println(e.getLocalizedMessage());
                return false;
        }
}
391

    
392
/**
 * Resets anahash: empties it, then maps every element of types to "-".
 */
private void fillanaHash()
{
        anahash.clear();
        for(String type in types)
        {
                anahash.put(type, "-");
        }
}
401

    
402

    
403
/**
 * Moves the parser forward until the end tag of teiHeader has been read,
 * or until the end of the document if there is no such tag.
 */
private void GoToText()
{
        int event = parser.next();
        while(event != XMLStreamConstants.END_DOCUMENT)
        {
                boolean endOfElement = (event == XMLStreamConstants.END_ELEMENT);
                if(endOfElement && parser.getLocalName().equals("teiHeader"))
                {
                        return;
                }
                event = parser.next();
        }
}
415

    
416
/**
 * Writes the document opened by compil() into a CQP source file.
 *
 * What follows the teiHeader is converted as follows:
 * the text tag receives the id, base and project given to compil();
 * p, q and l tags receive a numbered id; a head read inside a lg opens a
 * moral tag carrying the text of the head, closed at the end of the lg;
 * s tags are copied; each w produces one line made of its form, its ana
 * values and its id separated by tabulations, except for the words of a
 * head read inside a lg.
 *
 * The output, the parser and the input stream are closed before returning,
 * even when the conversion fails.
 *
 * @param dirPathName the path of the directory of the output file
 * @param fileName the name of the output file
 * @return true, if successful
 */
public boolean transfomFileCqp(String dirPathName, String fileName)
{
        // without a writer nothing can be produced
        if(!createOutput(dirPathName, fileName))
                return false;

        String vAna = "";
        String vForm = "";
        String wordid= "";
        String vHead = "";

        int p_id = 0;
        int q_id = 0;
        int lg_id = 0;
        int l_id = 0;

        boolean flaglg = false;
        boolean flaghead = false;
        boolean flagForm = false;
        boolean flagAna = false;

        boolean success = true;
        try
        {
                this.GoToText()

                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                {
                        switch (event)
                        {
                                case XMLStreamConstants.START_ELEMENT:
                                        switch (parser.getLocalName())
                                        {
                                                case "head":
                                                        flaghead =true;
                                                        vHead="";
                                                        break;

                                                case "text":
                                                        output.write("<text id=\""+text+"\" base=\""+base+"\" project=\""+project+"\">\n");
                                                        break;

                                                case "p":
                                                        output.write("<"+parser.getLocalName()+" id=\""+text+"_p_"+(p_id++)+"\">\n");
                                                        break;
                                                case "q":
                                                        output.write("<"+parser.getLocalName()+" id=\""+text+"_q_"+(q_id++)+"\">\n");
                                                        break;
                                                case "l":
                                                        output.write("<"+parser.getLocalName()+" id=\""+text+"_l_"+(l_id++)+"\">\n");
                                                        break;

                                                case "lg":
                                                        // the moral tag is only written once its head has been read
                                                        flaglg = true;
                                                        break;

                                                case "s":
                                                        output.write( "<s>\n");
                                                        break;

                                                case "w":
                                                        for(int i = 0 ; i < parser.getAttributeCount(); i++)
                                                        {
                                                                if(parser.getAttributeLocalName(i).equals("id"))
                                                                        wordid = parser.getAttributeValue(i);
                                                        }
                                                        break;

                                                case "form":
                                                        flagForm = true;
                                                        vForm = "";
                                                        vAna ="";
                                                        break;

                                                case "ana":
                                                        flagAna = true;
                                                        break;
                                        }
                                        break;

                                case XMLStreamConstants.END_ELEMENT:
                                        switch (parser.getLocalName())
                                        {
                                                case "head":
                                                        flaghead =false;
                                                        if(flaglg)
                                                                output.write("<moral id=\""+text+"_moral_"+(lg_id++)+"\" head=\""+vHead+"\">\n");
                                                        break;

                                                case "text":
                                                        output.write("</text>\n");
                                                        break;

                                                case "p":
                                                case "q":
                                                case "l":
                                                        output.write("</"+parser.getLocalName()+">\n");
                                                        break;

                                                case "lg":
                                                        output.write("</moral>\n");
                                                        flaglg = false;
                                                        break;

                                                case "s":
                                                        output.write( "</s>\n");
                                                        break;

                                                case "w":
                                                        // the words of the head of a lg are stored in the head attribute instead
                                                        if(!(flaghead && flaglg))
                                                                output.write( vForm +vAna+"\t"+wordid+"\n");
                                                        vAna = "";
                                                        vForm = "";
                                                        break;

                                                case "form":
                                                        flagForm = false;
                                                        break;

                                                case "ana":
                                                        flagAna = false;
                                                        break;
                                        }
                                        break;

                                case XMLStreamConstants.CHARACTERS:
                                        if(flagForm)
                                                vForm += parser.getText().trim();
                                        if(flagAna)
                                                vAna += "\t" +parser.getText().trim();
                                        if(flaghead && flaglg)
                                                vHead += parser.getText().trim();
                                        break;
                        }
                }
        }
        catch (XMLStreamException ex) {
                System.out.println(ex);
                success = false;
        }
        catch (IOException ex) {
                System.out.println("IOException while parsing " + inputData);
                success = false;
        }
        finally {
                try {
                        output.close();
                } catch (IOException ex) {
                        System.out.println(ex);
                        success = false;
                }
                try {
                        parser.close();
                } catch (XMLStreamException ex) {
                        System.out.println(ex);
                }
                // closing a StAX reader does not close its input source
                try {
                        inputData.close();
                } catch (IOException ex) {
                        System.out.println(ex);
                }
        }

        return success;
}
571

    
572
/**
 * Third import step: writes every given file into "cqp/perrault.cqp", then
 * builds the PERRAULT CWB corpus from it.
 *
 * The working directory is the parent of the directory holding the first
 * file; its "cqp" directory is recreated and "registry" is created.
 *
 * @param files the files to compile
 * @return true, if successful; false when the CWB executables are not
 *         available, when the working directory does not exist or when
 *         a CWB tool raises an exception
 */
public boolean run3(List<File> files)
{
        String rootDir = "";
        if (files.size() > 0) {
                File workingDir = files.get(0).getParentFile().getParentFile();
                rootDir = workingDir.getAbsolutePath()+"/";//"~/xml/perrault/";
        }

        if (!CwbEncode.isExecutableAvailable() || !CwbMakeAll.isExecutableAvailable()) {
                println ("Error: CWB executables not well set.")
                return false;
        }
        if (!new File(rootDir).exists()) {
                println ("binary directory does not exists: "+rootDir)
                return false;
        }

        //cleaning&preparing
        String cqpDir = rootDir+"cqp/";
        new File(cqpDir,"perrault.cqp").delete();
        new File(cqpDir).deleteDir();
        new File(cqpDir).mkdir();
        new File(rootDir+"registry/").mkdir();

        //1- Transform into CQP file, texts are numbered from 1 (missing files keep their number)
        for (int i = 0; i < files.size(); i++) {
                File current = files.get(i);
                if (current.exists()) {
                        println("process file "+current)
                        compil(current.toURL(),"text"+(i + 1),"perrault","default");
                        transfomFileCqp(rootDir+"cqp","perrault.cqp");
                } else {
                        println("file "+current+ " does not exists")
                }
        }

        //2- Import into CWB
        CwbEncode encoder = new CwbEncode();
        CwbMakeAll indexer = new CwbMakeAll();

        String[] pAttributes = ["id","pos","lemme"];
        String[] sAttributes = ["text:0+id+base+project","p:0+id","q:0+id","moral:0+head+id","l:0+id"];

        String dataPath = rootDir + "data/"+"PERRAULT";
        String cqpPath = rootDir + "/cqp/"+"perrault.cqp";
        String registryPath = rootDir + "registry/"+"perrault";

        try {
                encoder.run(dataPath, cqpPath, registryPath, pAttributes, sAttributes);
                indexer.run("PERRAULT", rootDir + "registry");
        } catch (Exception ex) {
                System.out.println(ex);
                return false;
        }

        System.out.println("Done.")

        return true;
}
638

    
639
////FIN
// "~" is expanded by shells, not by java.io.File: the home directory of the
// user is resolved explicitly so that the paths point to $HOME/xml/perrault
String perraultHome = System.getProperty("user.home")+"/xml/perrault/";

println "IMPORTER"
File[] files = [new File(perraultHome+"perrault.xml")];
run1(files);

println "ANNOTATE"
File rootDir = new File(perraultHome);
run2(rootDir);

println "COMPIL"
File directory = new File(perraultHome+"txm/");
files = directory.listFiles();
ArrayList<File> Lfiles = new ArrayList<File>();
for(File f : files)
        Lfiles.add f
run3(Lfiles);

//move registry file to cwb registry dir
// NOTE(review): run3 writes its registry in <working dir>/registry, not in txm/registry -- confirm this path
File registryfile = new File(perraultHome+"txm/registry/perrault");