Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / nlp / TnT.groovy @ 1000

History | View | Annotate | Download (13 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate: 2013-05-06 17:38:43 +0200 (lun. 06 mai 2013) $
25
// $LastChangedRevision: 2386 $
26
// $LastChangedBy: mdecorde $ 
27
//
28
package org.txm.scripts.scripts;
29

    
30
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
38

    
39
// Trigrams'n'Tags : part-of-speech tagger
40
// TODO: Auto-generated Javadoc
41

    
42
/**
 * Thin wrapper around the TnT (Trigrams'n'Tags) command-line tools.
 *
 * <p>Every option is named after the command-line switch it produces. An
 * option is switched on with its {@code setX} method and off with the
 * matching {@code unsetX} method. The {@code tnt}, {@code tntdiff},
 * {@code tntpara} and {@code tntwc} methods assemble a command line from the
 * options currently switched on, start the executable whose name is
 * {@code binpath} followed by the tool name, and echo the tool's standard
 * output to {@code System.out}.
 *
 * <p>The option state is shared by all four tools: for example {@code -H},
 * {@code -i}, {@code -l}, {@code -n} and {@code -v} are read by more than one
 * of the runner methods.
 */
class TnT {

    /** Prefix prepended verbatim to the executable name ("tnt", "tnt-diff", ...). */
    String binpath = "";

    /**
     * Creates a wrapper for the tools located under the given prefix.
     *
     * @param binpath prefix concatenated, without any separator being added,
     *        with the executable name
     */
    public TnT(String binpath) {
        this.binpath = binpath;
    }

    /** The version. */
    String version = "0.0.0";

    /** Short description of the wrapped tool. */
    String desc = "Trigrams'n'Tags : part-of-speach tagger";

    /** Debug flag; it is stored but not read anywhere in this class. */
    boolean debug = false;

    /**
     * Stores the debug flag.
     *
     * @param b the new value of the debug flag
     */
    public void debug(boolean b) {
        debug = b;
    }

    // ---------------------------------------------------------------- tnt

    // -a : use suffix trie with max. suffix length = len; default = 10

    /** Whether -a is switched on. */
    private boolean isa = false;

    /** Value passed after -a by tnt(). */
    private int a;

    /**
     * Switches -a on.
     *
     * @param arg value passed after -a by tnt()
     */
    public void seta(int arg) {
        this.isa = true;
        this.a = arg;
    }

    /** Switches -a off. */
    public void unseta() {
        this.isa = false;
    }

    // -b : use fil as backup lexicon

    /** Whether -b is switched on. */
    private boolean isb = false;

    /** Value passed after -b. */
    private File b;

    /**
     * Switches -b on.
     *
     * @param arg file passed after -b
     */
    public void setb(File arg) {
        this.isb = true;
        this.b = arg;
    }

    /** Switches -b off. */
    public void unsetb() {
        this.isb = false;
    }

    // -B : backup mode : 0=main only, 1=mix, 2=backup only (default: 1)

    /** Whether -B is switched on. */
    private boolean isB = false;

    /** Value passed after -B. */
    private int B;

    /**
     * Switches -B on.
     *
     * @param arg value passed after -B
     */
    public void setB(int arg) {
        this.isB = true;
        this.B = arg;
    }

    /** Switches -B off. */
    public void unsetB() {
        this.isB = false;
    }

    // -d : sparse data mode

    /** Whether -d is switched on. */
    private boolean isd = false;

    /** Value passed after -d. */
    private String d;

    /**
     * Switches -d on.
     *
     * @param arg value passed after -d
     */
    public void setd(String arg) {
        this.isd = true;
        this.d = arg;
    }

    /** Switches -d off. */
    public void unsetd() {
        this.isd = false;
    }

    // -H : copy HTML tags to output (without tagging)

    /** Whether -H is switched on. */
    private boolean isH = false;

    /** Switches -H on. */
    public void setH() {
        this.isH = true;
    }

    /** Switches -H off. */
    public void unsetH() {
        this.isH = false;
    }

    // -m : mark unknown words in output with an asterisk (*)

    /** Whether -m is switched on. */
    private boolean ism = false;

    /**
     * Optional value appended after -m by tntdiff(); null means "no value".
     * NOTE(review): the original tntdiff() appended an undeclared variable
     * named m after -m; confirm against the tnt-diff manual what, if
     * anything, -m expects there.
     */
    private String m;

    /** Switches -m on as a bare flag (any value set earlier is discarded). */
    public void setm() {
        this.ism = true;
        this.m = null;
    }

    /**
     * Switches -m on and records a value that tntdiff() appends after it.
     * tnt() always passes -m as a bare flag.
     *
     * @param arg value appended after -m by tntdiff()
     */
    public void setm(String arg) {
        this.ism = true;
        this.m = arg;
    }

    /** Switches -m off and forgets its value. */
    public void unsetm() {
        this.ism = false;
        this.m = null;
    }

    // -n : use num-grams, num = 1, 2, 3, default = 3

    /** Whether -n is switched on. */
    private boolean isn = false;

    /** Value passed after -n by tnt(). */
    private int n;

    /**
     * Switches -n on.
     *
     * @param arg value passed after -n by tnt()
     */
    public void setn(int arg) {
        this.isn = true;
        this.n = arg;
    }

    /** Switches -n off. */
    public void unsetn() {
        this.isn = false;
    }

    // -u : unknown word count, default = 3

    /** Whether -u is switched on. */
    private boolean isu = false;

    /** Value passed after -u. */
    private int u;

    /**
     * Switches -u on.
     *
     * @param arg value passed after -u
     */
    public void setu(int arg) {
        this.isu = true;
        this.u = arg;
    }

    /** Switches -u off. */
    public void unsetu() {
        this.isu = false;
    }

    // -v : set verbosity level num: (default = 3)

    /** Whether -v is switched on. */
    private boolean isv = false;

    /** Value passed after -v by tnt(). */
    private int v;

    /**
     * Switches -v on.
     *
     * @param arg value passed after -v by tnt()
     */
    public void setv(int arg) {
        this.isv = true;
        this.v = arg;
    }

    /** Switches -v off. */
    public void unsetv() {
        this.isv = false;
    }

    // -z : output tag if prob is in beam num, default =0 (means infinity)

    /** Whether -z is switched on. */
    private boolean isz = false;

    /** Value passed after -z. */
    private int z;

    /**
     * Switches -z on.
     *
     * @param arg value passed after -z
     */
    public void setz(int arg) {
        this.isz = true;
        this.z = arg;
    }

    /** Switches -z off. */
    public void unsetz() {
        this.isz = false;
    }

    // -Z : cut off path if prob is not in beam num (default 1000)

    /** Whether -Z is switched on. */
    private boolean isZ = false;

    /** Value passed after -Z. */
    private int Z;

    /**
     * Switches -Z on.
     *
     * @param arg value passed after -Z
     */
    public void setZ(int arg) {
        this.isZ = true;
        this.Z = arg;
    }

    /** Switches -Z off. */
    public void unsetZ() {
        this.isZ = false;
    }

    // tagging. the language model is loaded from model.lex and model.123. if
    // model.map exists, it is used for output mapping. compressed files are
    // recognized by suffix .gz, .bz2, or .Z

    /**
     * Runs the "tnt" executable with the options currently switched on and
     * echoes its standard output to {@code System.out}.
     *
     * <p>Unlike the other runner methods, the tool's standard error is not
     * merged into its standard output and is not read by this class.
     * NOTE(review): a tool that writes a lot to stderr could therefore block;
     * confirm whether keeping the streams separate is still required.
     *
     * @param model model name; will be searched in current dir or in the path
     *        specified in the env var TNT_MODELS
     * @param corpus corpus file, passed as the last argument
     * @throws IOException if the executable cannot be started or its output
     *         cannot be read
     */
    public void tnt(String model, File corpus) throws IOException {
        List<String> args = new ArrayList<String>();
        args.add(binpath + "tnt");
        addOption(args, isa, "-a", a);
        addOption(args, isb, "-b", b);
        addOption(args, isB, "-B", B);
        addOption(args, isd, "-d", d);
        addFlag(args, isH, "-H");
        addFlag(args, ism, "-m");
        addOption(args, isn, "-n", n);
        addOption(args, isu, "-u", u);
        addOption(args, isv, "-v", v);
        addOption(args, isz, "-z", z);
        addOption(args, isZ, "-Z", Z);
        args.add("" + model);
        args.add("" + corpus);

        execute(args, false);
    }

    // ----------------------------------------------------------- tnt-diff

    // -f : print accuracy vs. frequency, max freq = max

    /** Whether -f is switched on. */
    private boolean isf = false;

    /** Value passed after -f. */
    private int f;

    /**
     * Switches -f on.
     *
     * @param arg value passed after -f
     */
    public void setf(int arg) {
        this.isf = true;
        this.f = arg;
    }

    /** Switches -f off. */
    public void unsetf() {
        this.isf = false;
    }

    // -i : ignore upper/lower case of tokens

    /** Whether -i is switched on. */
    private boolean isi = false;

    /** Switches -i on. */
    public void seti() {
        this.isi = true;
    }

    /** Switches -i off. */
    public void unseti() {
        this.isi = false;
    }

    // -l : lexicon to account for known/unknown words

    /** Whether -l is switched on. */
    private boolean isl = false;

    /** Value passed after -l by tntdiff(). */
    private File l;

    /**
     * Switches -l on.
     *
     * @param arg file passed after -l by tntdiff()
     */
    public void setl(File arg) {
        this.isl = true;
        this.l = arg;
    }

    /** Switches -l off. */
    public void unsetl() {
        this.isl = false;
    }

    // counting differences

    /**
     * Runs the "tnt-diff" executable on two files and echoes its output
     * (standard output and standard error merged) to {@code System.out}.
     *
     * <p>Here -a is passed as a bare flag: the value given to seta(int) is
     * not used. -m is followed by a value only when one was supplied through
     * setm(String).
     *
     * @param originalfile first file argument
     * @param resultfile second file argument
     * @throws IOException if the executable cannot be started or its output
     *         cannot be read
     */
    public void tntdiff(File originalfile, File resultfile) throws IOException {
        List<String> args = new ArrayList<String>();
        args.add(binpath + "tnt-diff");
        addFlag(args, isa, "-a");
        addOption(args, isf, "-f", f);
        addFlag(args, isi, "-i");
        addOption(args, isl, "-l", l);
        if (ism) {
            args.add("-m");
            // The original referenced an undeclared variable here and failed
            // as soon as -m was switched on; the value is now optional.
            if (m != null) {
                args.add(m);
            }
        }
        args.add("" + originalfile);
        args.add("" + resultfile);

        execute(args, true);
    }

    // ----------------------------------------------------------- tnt-para

    // -c : encode capitalization in tag

    /** Whether -c is switched on. */
    private boolean isc = false;

    /** Switches -c on. */
    public void setc() {
        this.isc = true;
    }

    /** Switches -c off. */
    public void unsetc() {
        this.isc = false;
    }

    // -o : base name for output files, default=basename of corpus

    /** Whether -o is switched on. */
    private boolean iso = false;

    /** Value passed after -o. */
    private String o;

    /**
     * Switches -o on.
     *
     * @param arg value passed after -o
     */
    public void seto(String arg) {
        this.iso = true;
        this.o = arg;
    }

    /** Switches -o off. */
    public void unseto() {
        this.iso = false;
    }

    // parameters generation

    /**
     * Runs the "tnt-para" executable on a corpus and echoes its output
     * (standard output and standard error merged) to {@code System.out}.
     *
     * <p>Here -l, -n and -v are passed as bare flags: the values given to
     * setl, setn and setv are not used.
     *
     * @param corpus corpus file, passed as the last argument
     * @throws IOException if the executable cannot be started or its output
     *         cannot be read
     */
    public void tntpara(File corpus) throws IOException {
        List<String> args = new ArrayList<String>();
        args.add(binpath + "tnt-para");
        addFlag(args, isc, "-c");
        addFlag(args, isH, "-H");
        addFlag(args, isi, "-i");
        addFlag(args, isl, "-l");
        addFlag(args, isn, "-n");
        addOption(args, iso, "-o", o);
        addFlag(args, isv, "-v");
        args.add("" + corpus);

        execute(args, true);
    }

    // ------------------------------------------------------------- tnt-wc

    // -t : count tags

    /** Whether -t is switched on. */
    private boolean ist = false;

    /** Switches -t on. */
    public void sett() {
        this.ist = true;
    }

    /** Switches -t off. */
    public void unsett() {
        this.ist = false;
    }

    // -w : count words tokens

    /** Whether -w is switched on. */
    private boolean isw = false;

    /** Switches -w on. */
    public void setw() {
        this.isw = true;
    }

    /** Switches -w off. */
    public void unsetw() {
        this.isw = false;
    }

    // counting tokens and types

    /**
     * Runs the "tnt-wc" executable on a corpus and echoes its output
     * (standard output and standard error merged) to {@code System.out}.
     *
     * <p>Here -l is passed as a bare flag: the file given to setl is not
     * used.
     *
     * @param corpus corpus file, passed as the last argument
     * @throws IOException if the executable cannot be started or its output
     *         cannot be read
     */
    public void tntwc(File corpus) throws IOException {
        List<String> args = new ArrayList<String>();
        args.add(binpath + "tnt-wc");
        addFlag(args, isH, "-H");
        addFlag(args, isi, "-i");
        addFlag(args, isl, "-l");
        addFlag(args, ist, "-t");
        addFlag(args, isw, "-w");
        args.add("" + corpus);

        execute(args, true);
    }

    // ------------------------------------------------------------ helpers

    /** Appends a bare switch to the command line when it is enabled. */
    private static void addFlag(List<String> args, boolean enabled, String flag) {
        if (enabled) {
            args.add(flag);
        }
    }

    /** Appends a switch followed by the text form of its value when it is enabled. */
    private static void addOption(List<String> args, boolean enabled, String flag, Object value) {
        if (enabled) {
            args.add(flag);
            args.add("" + value);
        }
    }

    /**
     * Starts the command, copies its standard output line by line to
     * {@code System.out}, waits for it to finish and, on a non-zero exit
     * code, reports the code on {@code System.err} followed by the command
     * line on {@code System.out}.
     *
     * @param args the command line, executable first
     * @param mergeStderr whether the process's standard error is merged into
     *        the stream that is echoed
     * @throws IOException if the process cannot be started (previously this
     *         was printed and then surfaced as a NullPointerException) or
     *         its output cannot be read
     */
    private void execute(List<String> args, boolean mergeStderr) throws IOException {
        ProcessBuilder builder = new ProcessBuilder(args);
        if (mergeStderr) {
            builder.redirectErrorStream(true);
        }
        Process process = builder.start();

        // Reads with the platform default charset, as the original did.
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(process.getInputStream()));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        } finally {
            reader.close();
        }

        int exitCode;
        try {
            exitCode = process.waitFor();
        } catch (InterruptedException interrupted) {
            // Keep the interrupt visible to the caller instead of swallowing
            // it; the exit code is unknown, so nothing is reported.
            Thread.currentThread().interrupt();
            return;
        }

        if (exitCode != 0) {
            String when = DateFormat.getDateInstance(DateFormat.FULL, Locale.UK)
                    .format(new Date());
            System.err.println("Process exited abnormally with code " + exitCode + " at " + when);

            StringBuilder commandLine = new StringBuilder();
            for (String arg : args) {
                commandLine.append(arg).append(" ");
            }
            System.out.println(commandLine.toString());
        }
    }

    /**
     * The main method. It only constructs a wrapper with an empty binpath
     * and runs no tool.
     *
     * @param args the arguments (unused)
     */
    public static void main(String[] args) {
        TnT tt = new TnT("");
    }
}