Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / transcriber / TranscriberTokenizer.groovy @ 479

History | View | Annotate | Download (4.4 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
// 
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
// 
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
// 
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
// 
// 
// 
// $LastChangedDate:$
// $LastChangedRevision:$
// $LastChangedBy:$ 
//
28
package org.txm.importer.transcriber
29

    
30
import filters.Tokeniser.SimpleTokenizerXml;
31

    
32
/**
 * Tokenizer that interprets Transcriber notations found in the text.
 *
 * For every whitespace-separated token it derives three word attributes:
 * "audio" (present / absent / partiel, driven by parentheses), "event"
 * (markers such as rapp1, rapp2, orth, corr, joined with '#') and
 * "notation" (the token as it was written, before any marker is stripped).
 *
 * The members buffer, writer, wordcount, DEBUG and standardChecks() are not
 * declared here; presumably they are inherited from SimpleTokenizerXml.
 */
class TranscriberTokenizer extends SimpleTokenizerXml{

    /**
     * True when the last token processed ended inside a "(" that has not
     * been closed yet; the following tokens then default to audio "absent".
     */
    boolean troncature = false;

    /** Value written to the "audio" attribute; reset for every token in processWord(). */
    String audio ="F";

    /** The token as written in the source, recorded before markers are stripped. */
    String notation = "";

    /** Reported speech flag: set by a token starting with '"', cleared by a token ending with '"'. */
    boolean rapp = false;

    /** Value written to the "event" attribute: marker names separated by '#'. */
    String event="";

    /** Name of the input file without its last extension; used to build word ids. */
    String txtname;

    /**
     * Instantiates a new transcriber tokenizer without a language
     * (the language passed to the parent constructor is null).
     *
     * @param infile the infile
     * @param outfile the outfile
     */
    public TranscriberTokenizer(File infile, File outfile)
    {
        // single initialisation path: same effect as calling super(infile, outfile, null)
        this(infile, outfile, null)
    }

    /**
     * Instantiates a new transcriber tokenizer.
     *
     * @param infile the infile; its name minus the last extension becomes txtname
     * @param outfile the outfile
     * @param lang the language, forwarded unchanged to the parent constructor
     */
    public TranscriberTokenizer(File infile, File outfile, String lang)
    {
        super(infile, outfile, lang)
        txtname = infile.getName();
        int idx = txtname.lastIndexOf(".")
        // idx > 0 (not >= 0): a name that only starts with a dot is kept whole
        if (idx > 0) txtname = txtname.substring(0, idx);
    }

    /* (non-Javadoc)
     * @see filters.Tokeniser.SimpleTokenizerXml#processWord()
     */
    /**
     * Splits the content of buffer into tokens and, for each one, computes
     * the audio / event / notation fields before handing the cleaned token
     * to iterate().
     */
    protected void processWord()
    {
        String text = buffer.toString().trim().replace("\t", " ");
        text = text.replaceAll("\\p{C}", "");
        for (String s : text.split("[\\p{Z}\\p{C}]+") )
        {
            // default audio status depends on the state left by the previous tokens
            audio = troncature ? "absent" : "present";
            notation = s;
            event = "";

            // reported speech markers
            // NOTE(review): a token that both starts and ends with '"' only
            // takes the first branch, so rapp stays true - confirm this is intended.
            if (s.startsWith("\"")) {
                rapp = true;
                event += "#rapp1";
            } else if (s.endsWith("\"")) {
                rapp = false;
                event += "#rapp2";
            }

            // event markers: they are removed from the token once recorded
            if (s.startsWith("^^")) {
                event += "#orth";
                s = s.substring(2);
            }
            if (s.startsWith("*")) {
                event += "#corr";
                s = s.substring(1);
            }

            // every marker was appended with a leading '#': drop the first one
            if (event.length() > 0)
                event = event.substring(1);

            // audio status driven by parentheses
            boolean opens = s.contains("(");
            boolean closes = s.contains(")");
            if (opens || closes)
            {
                // "absent" only when nothing of the token lies outside the parentheses:
                //   (ertert    ertert)    (ertert)     -> absent
                //   ert(ert    ert)ert    ert(ert)ert  ert)ert(ert -> partiel
                boolean coveredFromStart = !opens || s.startsWith("(");
                boolean coveredToEnd = !closes || s.endsWith(")");
                audio = (coveredFromStart && coveredToEnd) ? "absent" : "partiel";

                // The state carried to the next token is decided by the LAST
                // parenthesis of this one (lastIndexOf returns -1 when absent),
                // so tokens with repeated parentheses such as "(a)b(c" also
                // leave the correct state.
                troncature = s.lastIndexOf("(") > s.lastIndexOf(")");

                s = s.replace("(","");
                s = s.replace(")","");
            }

            iterate(s);
        }
    }

    /**
     * Repeatedly passes the string to standardChecks(), continuing with the
     * value it returns until that value is null or empty.
     * The event field is cleared after the first pass, so later passes run
     * with an empty event.
     *
     * @param s the token to process, already stripped of Transcriber markers
     */
    protected iterate(String s)
    {
        while (s != null && s.length() > 0) {
            if (DEBUG) {println "  > $s"}
            s = standardChecks(s);
            event = "";
        }
    }

    /* (non-Javadoc)
     * @see filters.Tokeniser.SimpleTokenizerXml#writeWordAttributes()
     */
    /**
     * Writes the word attributes through writer: an id built from txtname
     * and wordcount, then the current audio, event and notation values.
     */
    protected writeWordAttributes()
    {
        writer.writeAttribute("id","w_"+txtname+"_"+wordcount);
        writer.writeAttribute("audio", audio);
        writer.writeAttribute("event", event);
        writer.writeAttribute("notation", notation);
    }
}