Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / teitxm / CSV2W_ANA.groovy @ 479

History | View | Annotate | Download (14.3 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2016-03-29 09:51:35 +0200 (mar. 29 mars 2016) $
25
// $LastChangedRevision: 3185 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.teitxm;
29

    
30
import java.io.File;
31
import java.io.Writer;
32

    
33
import org.txm.importer.graal.PersonalNamespaceContext;
34
import org.txm.utils.treetagger.TreeTagger;
35
import java.util.Iterator;
36

    
37
import javax.xml.namespace.NamespaceContext;
38

    
39
import java.io.FileInputStream;
40
import java.io.InputStreamReader;
41
import java.io.Reader;
42
import java.nio.charset.Charset;
43
import java.util.Iterator;
44

    
45
import javax.xml.stream.*;
46
// TODO: Auto-generated Javadoc
47
/**  
48
 * @author mdecorde  
49
 * 
50
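 * Takes a tab-separated (CSV-like) file with columns [word, pos1, pos2, ...] and builds a
 * stand-off TEI annotation file ("xml-pos") from it.
 * Configure the instance before calling process():
 * - setAnnotationTypes(types, typesDesc, typesURI, typesWEB, idform):
 *   types is the list of annotation types [pos1, pos2, ...]; idform is the prefix of the words' ids;
 * - setResp(respId, respChat, respDate, respPerson, respWhen): the person responsible for the annotation;
 * - setApp(appIdent, appVersion): the software which creates the annotations;
 * - setTarget(target, reportFile): the annotated file and the report file holding the used command line;
 * - setInfos(distributor, publiChat, sourceChat): raw XML fragments for the header.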
51
 * 
52
 */
53
class CSV2W_ANA {
54
        /** Encoding name given to process(); stored there before the document is written. */
        String encoding;

        /** Name of the annotated file; appears in the header title ("Standoff annotation file for ..."). */
        String target = ""

        /** Raw XML/text written verbatim (as UTF-8 bytes) inside the distributor element. */
        String distributor = "";

        /** Raw XML written verbatim inside publicationStmt, after the distributor element. */
        String publiChat = "";

        /** Raw XML written verbatim inside sourceDesc. */
        String sourceChat = "";

        /**
         * Report file whose non-empty lines are copied verbatim inside txm:application;
         * when null an empty txm:commandLine element is written instead.
         */
        File reportfile;

        /** Text of respStmt/name[@type='person']. */
        String respPerson;

        /** xml:id of the respStmt element, also used as the resp attribute of txm:application. */
        String respId;

        /** Text of the respStmt/resp element. */
        String respChat;

        /** Human readable date, text of the resp/date element. */
        String respDate;

        /** Value of the "when" attribute of the resp/date element. */
        String respWhen;

        /** Value of the "ident" attribute of txm:application. */
        String appIdent;

        /** Value of the "version" attribute of txm:application. */
        String appVersion;

        /** Annotation type names: linkGrp types, taxonomy xml:ids and "#type" ref targets. */
        def types;

        /** Per-type descriptions, written as the taxonomy/bibl/title text (same index as types). */
        def typesDesc;

        /** Per-type tagset URIs, written as ref[@type='tagset'] targets; "" means no ref. */
        def typesURI;

        /** Per-type web site URLs, written as ref[@type='siteweb'] targets; "" means no ref. */
        def typesWEB;

        /** Prefix of the word ids; a running number is appended to build each link target. */
        String idform = "";

        /** Factory used to create the stream writer in process(). */
        XMLOutputFactory factory = XMLOutputFactory.newInstance();

        /** Byte stream of the result file; also written to directly for raw XML fragments. */
        BufferedOutputStream output;

        /** StAX writer producing the result file on top of {@code output}. */
        XMLStreamWriter writer;

        /** TXM namespace URI (constant, hence final). */
        public static final String TXMNS = "http://textometrie.org/1.0";

        /** TEI namespace URI (constant, hence final). */
        public static final String TEINS = "http://www.tei-c.org/ns/1.0";
114

    
115
        /**
         * Converts a TreeTagger result file into a stand-off annotation file.
         *
         * Opens the result file, writes header, body and tail, then releases the
         * writer and the stream whatever happened. Any exception is printed and
         * reported through the return value instead of being propagated.
         *
         * @param TTrez the tt result file (tab-separated columns)
         * @param xmlform the standoff file to create
         * @param encoding encoding name; stored in the {@code encoding} field and handed to writeBody
         * @return true if the file was written and closed without error, false otherwise
         */
        public boolean process(File TTrez,File xmlform, String encoding )
        {
                boolean success = true;
                // forget handles of a previous run so the cleanup below never touches stale objects
                writer = null;
                output = null;
                try {
                        this.encoding = encoding;
                        output = new BufferedOutputStream(new FileOutputStream(xmlform));
                        writer = factory.createXMLStreamWriter(output, "UTF-8"); // create a new file
                        writer.setNamespaceContext(new PersonalNamespaceContext());
                        writeHead();
                        writeBody(TTrez, encoding);
                        writeTail();
                } catch (Exception e) {
                        e.printStackTrace();
                        success = false;
                } finally {
                        // either handle may still be null if opening the file failed
                        if (writer != null) {
                                try {
                                        writer.close();
                                } catch (Exception closeError) {
                                        closeError.printStackTrace();
                                        success = false;
                                }
                        }
                        // XMLStreamWriter.close() does not close the underlying stream
                        if (output != null) {
                                try {
                                        output.close();
                                } catch (Exception closeError) {
                                        closeError.printStackTrace();
                                        success = false;
                                }
                        }
                }
                return success;
        }
143

    
144
        /**
         * Writes the XML declaration, the TEI root with its namespaces, the whole
         * teiHeader (fileDesc then encodingDesc) and opens text/body/div, which are
         * left open for the body and closed by writeTail().
         *
         * The declaration always announces UTF-8: the stream writer is created with
         * "UTF-8" and every raw fragment is written as UTF-8 bytes, so announcing
         * any other encoding would mislabel the file.
         */
        private void writeHead()
        {
                writer.writeStartDocument("UTF-8", "1.0");
                writer.writeStartElement("TEI");
                writer.writeDefaultNamespace(TEINS);
                writer.writeNamespace("txm", TXMNS);
                writer.writeStartElement("teiHeader");
                writeFileDesc();
                writeEncodingDesc();
                writer.writeCharacters("\n");
                writer.writeEndElement(); // teiHeader

                writer.writeStartElement("text");
                writer.writeAttribute("type", "standoff");
                writer.writeStartElement("body");
                writer.writeCharacters("\n");
                writer.writeStartElement("div");
                writer.writeCharacters(""); // forces the div start tag to be completed
        }

        /**
         * Writes the fileDesc element: titleStmt (title and respStmt),
         * publicationStmt and sourceDesc.
         */
        private void writeFileDesc()
        {
                writer.writeStartElement("fileDesc");

                writer.writeStartElement("titleStmt");
                writer.writeStartElement("title");
                writer.writeCharacters("Standoff annotation file for "+target);
                writer.writeEndElement(); // title
                writer.writeStartElement("respStmt");
                writer.writeAttribute("xml:id", respId);
                writer.writeStartElement("resp");
                writer.writeCharacters(respChat);
                writer.writeStartElement("date");
                writer.writeAttribute("when", respWhen);
                writer.writeCharacters(respDate);
                writer.writeEndElement(); // date
                writer.writeEndElement(); // resp
                writer.writeStartElement("name");
                writer.writeAttribute("type", "person");
                writer.writeCharacters(respPerson+"\n");
                writer.writeEndElement(); // name
                writer.writeEndElement(); // respStmt
                writer.writeEndElement(); // titleStmt

                writer.writeStartElement("publicationStmt");
                writer.writeCharacters("\n");
                writer.writeStartElement("distributor");
                writer.writeCharacters("\n");
                writeRawUtf8(distributor);
                writer.writeEndElement(); // distributor
                writer.writeCharacters("\n");
                writeRawUtf8(publiChat);
                writer.writeEndElement(); // publicationStmt

                writer.writeStartElement("sourceDesc");
                writer.writeCharacters("\n");
                writeRawUtf8(sourceChat);
                writer.writeEndElement(); // sourceDesc

                writer.writeEndElement(); // fileDesc
        }

        /**
         * Writes the encodingDesc element: appInfo/txm:application (report content
         * or an empty txm:commandLine, then the list of annotation types) followed
         * by classDecl with one taxonomy per annotation type.
         */
        private void writeEncodingDesc()
        {
                writer.writeStartElement("encodingDesc");
                writer.writeStartElement("appInfo");
                writer.writeStartElement(TXMNS,"application");
                writer.writeAttribute("ident",appIdent);
                writer.writeAttribute("version", appVersion);
                writer.writeAttribute("resp", respId);
                writer.writeCharacters("\n");
                writer.flush(); // the report lines below bypass the writer

                def report = this.reportfile;
                if (report != null) {
                        Charset utf8 = Charset.forName("UTF-8");
                        // NOTE(review): FileReader decodes with the platform charset - confirm the report files' encoding
                        Reader reader = new FileReader(report);
                        try {
                                String line = reader.readLine();
                                while (line != null) {
                                        // lines are copied verbatim (no escaping): the report is expected to hold XML
                                        if (line.length() != 0)
                                                output.write((line+"\n").getBytes(utf8));
                                        line = reader.readLine();
                                }
                        } finally {
                                reader.close(); // released even when reading or writing fails
                        }
                        output.flush();
                } else {
                        writer.writeEmptyElement(TXMNS, "commandLine");
                        writer.writeCharacters("\n");
                }

                writer.writeStartElement("ab");
                writer.writeAttribute("type", "annotation");
                writer.writeCharacters("\n");
                writer.writeStartElement("list");
                writer.writeCharacters("\n");
                for (int i = 0; i < types.size(); i++) {
                        writer.writeStartElement("item");
                        writer.writeEmptyElement("ref");
                        writer.writeAttribute("type", "tagset");
                        writer.writeAttribute("target","#"+types[i]);
                        writer.writeEndElement(); // item
                        writer.writeCharacters("\n");
                }
                writer.writeEndElement(); // list
                writer.writeEndElement(); // ab
                writer.writeEndElement(); // txm:application
                writer.writeEndElement(); // appInfo

                writer.writeStartElement("classDecl");
                for (int i = 0; i < types.size(); i++) {
                        writer.writeStartElement("taxonomy");
                        writer.writeAttribute("xml:id", types[i]);
                        writer.writeStartElement("bibl");
                        writer.writeStartElement("title");
                        writer.writeCharacters(typesDesc[i]);
                        writer.writeEndElement(); // title
                        // a null entry is treated like "": no ref is written
                        def tagsetUri = typesURI[i];
                        if (tagsetUri != null && !tagsetUri.equals("")) {
                                writer.writeEmptyElement("ref");
                                writer.writeAttribute("type", "tagset");
                                writer.writeAttribute("target", tagsetUri);
                                writer.writeCharacters("\n");
                        }
                        def webSite = typesWEB[i];
                        if (webSite != null && !webSite.equals("")) {
                                writer.writeEmptyElement("ref");
                                writer.writeAttribute("type", "siteweb");
                                writer.writeAttribute("target", webSite);
                                writer.writeCharacters("\n");
                        }
                        writer.writeEndElement(); // bibl
                        writer.writeCharacters("\n");
                        writer.writeEndElement(); // taxonomy
                        writer.writeCharacters("\n");
                }
                writer.writeEndElement(); // classDecl
                writer.writeCharacters("\n");
                writer.writeEndElement(); // encodingDesc
        }

        /**
         * Writes an already formed XML fragment straight to the byte stream.
         * The stream writer is flushed first so the fragment lands after
         * everything written through it so far.
         *
         * @param xml the fragment, written as UTF-8 bytes without any escaping
         */
        private void writeRawUtf8(String xml)
        {
                writer.flush();
                output.write(xml.getBytes(Charset.forName("UTF-8")));
                output.flush();
        }
275

    
276
        /**
         * Writes one linkGrp element per annotation type. For each type the
         * result file is read again line by line; every line that has exactly
         * types.size()+1 tab-separated columns and whose first column is not a
         * sentence tag yields a link element whose targets are
         * "#" + idform + running number (starting at 1 for each type) and
         * "#" + the value of the column of that type.
         *
         * @param TTrez the tt result file
         * @param encoding the encoding used to read TTrez; "UTF-8" when null or empty
         * @return nothing meaningful (untyped method, evaluates to null)
         */
        private writeBody(File TTrez, String encoding) {
                String idPrefix = idform+"";
                String separator = "\t";
                // honour the caller's encoding instead of always assuming UTF-8
                String charset = (encoding != null && encoding.length() > 0) ? encoding : "UTF-8";
                int expectedColumns = types.size()+1; // word + one column per type

                for (int i = 0 ; i < types.size() ; i++) {
                        int column = i+1; // column 0 is the word itself
                        int id = 1;
                        writer.writeStartElement("linkGrp");
                        writer.writeAttribute("type", types[i]);
                        TTrez.eachLine(charset) { line ->
                                def fields = line.replaceAll("&lt;", "<").split(separator);
                                if (fields.size() != expectedColumns) {
                                        return; // not a token line
                                }
                                if (fields[0].startsWith("<s>") || fields[0].startsWith("</s>")) {
                                        return; // don't use the sentence tags
                                }
                                writer.writeEmptyElement("link");
                                writer.writeAttribute("targets", "#"+idPrefix+id+" #"+ fields[column]);
                                id++;
                                writer.writeCharacters("\n");
                        }
                        writer.writeEndElement(); // linkGrp
                }
        }
314

    
315
        /**
         * Closes the four elements still open once the body is written,
         * innermost first: div, body, text and TEI.
         */
        private void writeTail() {
                4.times {
                        writer.writeEndElement();
                }
        }
324

    
325
        /**
         * Stores the description of the annotation types and the word id prefix.
         * The four collections are read by index, one entry per type.
         *
         * @param types the annotation type names
         * @param typesDesc the description of each type
         * @param typesURI the tagset URI of each type
         * @param typesWEB the web site of each type
         * @param idform the prefix of the word ids
         */
        public void setAnnotationTypes(def types,def typesDesc,def typesURI,def typesWEB, String idform) {
                this.idform = idform;

                this.types = types;
                this.typesDesc = typesDesc;

                this.typesURI = typesURI;
                this.typesWEB = typesWEB;
        }
341

    
342
        /**
         * Stores the values written in the respStmt element of the header.
         *
         * @param respId the id of the respStmt element
         * @param respChat the text of the resp element
         * @param respDate the human readable date
         * @param respPerson the name of the responsible person
         * @param respWhen the value of the date's "when" attribute
         */
        public void setResp(respId, respChat,respDate, respPerson,respWhen) {
                this.respId = respId;
                this.respPerson = respPerson;
                this.respChat = respChat;

                this.respWhen = respWhen;
                this.respDate = respDate;
        }
358

    
359
        /**
         * Stores the name of the annotated file and the report file copied
         * into the header.
         *
         * @param target the name of the annotated file
         * @param reportFile the report file, may be null
         */
        public void setTarget(target, reportFile) {
                this.reportfile = reportFile;
                this.target = target;
        }
369

    
370
        /**
         * Stores the identification of the application that produced the
         * annotations.
         *
         * @param appIdent the application identifier
         * @param appVersion the application version
         */
        public void setApp(String appIdent, String appVersion) {
                this.appVersion = appVersion;
                this.appIdent = appIdent;
        }
380

    
381
        /**
         * Stores the raw fragments inserted in the publicationStmt and
         * sourceDesc parts of the header.
         *
         * @param distributor the content of the distributor element
         * @param publiChat the content written after the distributor element
         * @param sourceChat the content of the sourceDesc element
         */
        public void setInfos(distributor,  publiChat, sourceChat) {
                this.sourceChat = sourceChat;
                this.publiChat = publiChat;
                this.distributor = distributor;
        }
393

    
394
        /**
         * Sample run: builds a stand-off file from an existing TreeTagger
         * result, then runs TreeTagger with another model and builds a second
         * stand-off file from its result.
         *
         * All paths are resolved under the user's home directory.
         *
         * @param args the arguments (unused)
         */
        public static void main(String[] args) {

                // java.io.File never expands "~": resolve the home directory explicitly
                String home = System.getProperty("user.home");
                String rootDir = home+"/xml/rgaqcj/";
                File projDir = new File(rootDir, "proj");
                File posDir = new File(rootDir, "pos");
                // both directories receive output files, make sure they exist
                projDir.mkdirs();
                posDir.mkdirs();

                //targeted file for annotations
                String target = "roland.xml"
                //contains txm:application/txm:commandLine
                File reportFile = new File(rootDir, "tt1_REPORT.xml");

                String respPerson ="slh";
                String respId = "TT1";
                String respDesc = "NLP annotation tool";
                String respDate = "Tue Mar  11 1:02:55 Paris, Madrid 2010";
                String respWhen="2010-03-11"

                String appIdent = "TreeTagger";
                String appVersion = "3.2";

                String distributor = "BFM project - http://bfm.ens-lsh.fr";
                String publiStmt = """<availability>
                          <p>(c) 2010 Projet BFM - CNRS/ENS-LSH.
                          <hi>Conditions d'utilisation</hi> : 
                          Sous licence <ref target="http://creativecommons.org/licenses/by-sa/2.0/fr/">Creative Commons</ref>.
                          </p>
                        </availability>""";

                // built per run so that the date and the resp id always match the run described
                def sourceStmtFor = { String id, String date, String isoDate ->
                        return """<p>Born digital: TXM project - http://textometrie.org</p>
                        <p>Automatically generated on <date when="${isoDate}">${date}</date>
                        from <ref target="#${id}">Tree Tagger</ref> applied on the text roland-tt.
                        </p>""".toString();
                }
                String sourceStmt = sourceStmtFor(respId, respDate, respWhen);

                def types = ["CATTEX2009"];
                def typesDesc = ["CATTEX2009 POS description"]
                def typesTAGSET = ["http://bfm.ens-lsh.fr/IMG/xml/cattex2009.xml"]
                def typesWEB = ["http://bfm.ens-lsh.fr/article.php3?id_article=176"]
                String idform ="w_fro_";

                File ttfile = new File(projDir, "rol_rga.tt");
                File posfile = new File(posDir, "rolandTT1-w-ana.xml");
                String encoding ="UTF-8";
                println("Process file : "+ttfile+" to : "+posfile );

                def transfo = new CSV2W_ANA();
                transfo.setAnnotationTypes( types, typesDesc, typesTAGSET, typesWEB, idform);
                transfo.setResp(respId, respDesc,respDate, respPerson, respWhen);
                transfo.setApp(appIdent, appVersion);
                transfo.setTarget(target, reportFile);
                transfo.setInfos(distributor,  publiStmt, sourceStmt);
                if (!transfo.process( ttfile, posfile, encoding )) {
                        println("Failed to process file : "+ttfile);
                }

                //create annotation file for afrpar on roland.xml
                String infile = rootDir+"/ttsrc/roland-learn.tt";
                String modelfile =  rootDir+"/models/afr.par";
                String outfile = rootDir+"/proj/afr_rol.tt";

                println("proj "+modelfile+ " on " +infile +" >> "+outfile);

                TreeTagger tt = new TreeTagger(home+"/Bureau/treetagger/bin/");
                tt.settoken();
                tt.setquiet();
                tt.setsgml();
                tt.seteostag("<s>");
                tt.treetagger( modelfile, infile, outfile)

                reportFile = new File(rootDir, "tt2_REPORT.xml");
                respPerson ="slh";
                respId = "TT2";
                respDesc = "NLP annotation tool";
                respDate = "Tue Mar  15 1:02:55 Paris, Madrid 2010";
                respWhen = "2010-03-15"
                sourceStmt = sourceStmtFor(respId, respDate, respWhen);

                types = ["AFRPOS"];
                typesDesc = ["AFRPOS description"]
                typesWEB = ["http://www.uni-stuttgart.de/lingrom/stein/forschung/resource.html"]
                typesTAGSET = [""]
                idform ="w_fro_";

                ttfile = new File(projDir, "afr_rol.tt");
                posfile = new File(posDir, "rolandTT2-w-ana.xml");
                encoding ="UTF-8";
                println("Process file : "+ttfile+" to : "+posfile );

                transfo = new CSV2W_ANA();
                transfo.setAnnotationTypes( types, typesDesc, typesTAGSET, typesWEB, idform);
                transfo.setResp(respId, respDesc,respDate, respPerson,respWhen);
                transfo.setTarget(target, reportFile);
                transfo.setApp(appIdent, appVersion);
                transfo.setInfos(distributor,  publiStmt, sourceStmt);
                if (!transfo.process( ttfile, posfile, encoding )) {
                        println("Failed to process file : "+ttfile);
                }
        }
491
}