Statistics
| Revision:

ccc / projets / CMC2ELAN / src / TEICMCPerLocPerDate.groovy @ 2

History | View | Annotate | Download (8.2 kB)

1

    
2

    
3
import java.io.File;
4
import java.net.URL;
5

    
6
import javax.xml.stream.*;
7

    
8

    
9
/**
 * Splits the {@code posting} elements of an XML input file into several output
 * documents, one per "date + speaker" key.
 *
 * While the parser is inside a {@code posting}, the text of its {@code name},
 * {@code date}, {@code time} and {@code p} children is buffered instead of being
 * copied. When the {@code posting} ends, it is re-emitted on the writer registered
 * under {@code dateValue + postingWho}. Every other event is forwarded to the
 * current {@code writer} when one is selected, otherwise to all registered writers.
 *
 * NOTE(review): {@code parser}, {@code writer}, {@code localname}, {@code Nscontext}
 * and {@code process(...)} are not declared in this class; they are presumably
 * inherited from StaxIdentityParser - confirm against that class.
 */
public class TEICMCPerLocPerDate extends StaxIdentityParser {

        /** Directory in which the per-speaker/per-date files are created. */
        File outputDirectory;
        /** Output writers, keyed by date + speaker identifier. */
        def writers = [:];
        /** Number of postings routed to each writer; uses the same keys as {@code writers}. */
        def counts = [:];

        /**
         * @param inputFile the XML file to split (handed to the super constructor)
         * @param outputDirectory directory receiving the result files; created if missing
         */
        public TEICMCPerLocPerDate(File inputFile, File outputDirectory) {
                super(inputFile);
                this.outputDirectory = outputDirectory;
                // mkdirs() also creates missing parent directories, mkdir() would fail silently
                outputDirectory.mkdirs()
        }

        // --- state of the posting currently being read ---
        boolean inName = false;
        String filenameValue = "";
        boolean inDate = false;
        String dateValue = "";
        boolean inTime = false;
        String timeValue = "";
        boolean inPosting = false;
        String postingID = ""
        String postingWho = ""
        boolean inP;
        String content = ""

        /**
         * Applies {@code action} to the currently selected {@code writer}, or to every
         * registered writer when no writer is selected.
         */
        private void forEachTargetWriter(Closure action) {
                if (writer != null) {
                        action.call(writer)
                } else {
                        for (def swriter : writers.values()) action.call(swriter)
                }
        }

        /**
         * Handles a start tag. A {@code posting} opens the buffering mode and records its
         * {@code who} and {@code id} attributes; inside a posting, start tags are not
         * copied but only switch on the matching text buffer. Any other start tag is
         * forwarded to the target writer(s).
         */
        protected void processStartElement() {
                String name = parser.getLocalName()

                if (name == "posting") {
                        inPosting = true
                        // start from a clean slate so that a posting lacking a child element or
                        // an attribute never inherits the value read for the previous posting
                        postingID = ""
                        postingWho = ""
                        filenameValue = ""
                        dateValue = ""
                        timeValue = ""
                        content = ""
                        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                String attrName = parser.getAttributeLocalName(i)
                                if (attrName == "who") {
                                        postingWho = parser.getAttributeValue(i)
                                } else if (attrName == "id") {
                                        postingID = parser.getAttributeValue(i)
                                }
                        }
                        return;
                }

                if (inPosting) {
                        // tags inside a posting (including "dateline") are swallowed; the posting
                        // is rebuilt from the buffered values in processEndElement()
                        switch (name) {
                                case "name":
                                        inName = true
                                        filenameValue = ""
                                        break
                                case "date":
                                        inDate = true
                                        dateValue = ""
                                        break
                                case "time":
                                        inTime = true
                                        timeValue = ""
                                        break
                                case "p":
                                        inP = true
                                        content = ""
                                        break
                        }
                        return;
                }

                forEachTargetWriter { processStartElement(it) }
        }

        /**
         * Handles a text event: inside a posting the text is appended to the buffer of
         * the open {@code name}, {@code date}, {@code time} or {@code p} element;
         * any other text is forwarded to the target writer(s).
         */
        protected void processCharacters() {
                String text = parser.getText()

                if (inPosting) {
                        if (inName) {
                                filenameValue += text;
                                return;
                        } else if (inDate) {
                                dateValue += text;
                                return;
                        } else if (inTime) {
                                timeValue += text;
                                return;
                        } else if (inP) {
                                content += text;
                                return;
                        }
                        // text located between the children of a posting falls through and is
                        // forwarded like any other text
                }

                forEachTargetWriter { it.writeCharacters(text) }
        }

        /**
         * Handles an end tag. Inside a posting it closes the matching text buffer; the end
         * of {@code time} selects the output writer and the end of {@code posting} writes
         * the buffered posting. Outside a posting the end tag is forwarded to the target
         * writer(s).
         */
        protected void processEndElement()
        {
                if (inPosting) {
                        switch (parser.getLocalName()) {
                                case "name":
                                        inName = false
                                        break
                                case "date":
                                        inDate = false
                                        break
                                case "p":
                                        inP = false
                                        break
                                case "time":
                                        inTime = false
                                        // date and speaker are both known at this point: pick the output
                                        writer = getWriter(dateValue+postingWho)
                                        break
                                case "posting":
                                        if (writer == null) {
                                                println "Error Houston !!! with loc=$postingWho date=$dateValue at=" + parser.getLocation()
                                                // leave the posting anyway, otherwise every following element
                                                // would be swallowed by the "inPosting" branches
                                                inPosting = false
                                                return;
                                        }
                                        writePosting()
                                        inPosting = false
                                        writer = null;
                                        break
                        }
                        return;
                }

                forEachTargetWriter { it.writeEndElement() }
        }

        /** Writes the buffered posting (dateline with name/date/time, then p) on the current {@code writer}. */
        private void writePosting() {
                writer.writeStartElement("posting");
                writer.writeAttribute("xml:id",postingID);
                writer.writeAttribute("who",postingWho);

                writer.writeStartElement("dateline");
                writer.writeStartElement("name");
                writer.writeAttribute("type","file");
                writer.writeCharacters(filenameValue)
                writer.writeEndElement(); //name
                writer.writeStartElement("date");
                writer.writeCharacters(dateValue)
                writer.writeEndElement(); //date
                writer.writeStartElement("time");
                writer.writeCharacters(timeValue)
                writer.writeEndElement(); //time
                writer.writeEndElement(); //dateline

                writer.writeStartElement("p");
                writer.writeCharacters(content)
                writer.writeEndElement(); //p

                writer.writeEndElement(); //posting
        }

        /**
         * Copies the current start tag, its namespace declarations and its attributes
         * on {@code swriter}.
         *
         * @param swriter the writer receiving the start tag
         */
        protected void processStartElement(def swriter)
        {
                String prefix = parser.getPrefix();

                // NOTE(review): "localname" is not set in this class; presumably the parent
                // class updates it before calling processStartElement() - confirm
                if (prefix != null && prefix.length() > 0)
                        swriter.writeStartElement(Nscontext.getNamespaceURI(prefix), localname)
                else
                        swriter.writeStartElement(localname);

                for (int i = 0 ; i < parser.getNamespaceCount() ; i++) {
                        swriter.writeNamespace(parser.getNamespacePrefix(i), parser.getNamespaceURI(i));
                }

                writeAttributes(swriter);
        }

        /** Copies the current start tag on the current {@code writer}; same as {@code processStartElement(writer)}. */
        private void _processStartElement() {
                processStartElement(writer);
        }

        /** Forwards the current namespace declaration to the target writer(s). */
        protected void processNamespace() {
                String prefix = parser.getPrefix()
                String uri = parser.getNamespaceURI()
                forEachTargetWriter { it.writeNamespace(prefix, uri) }
        }

        /** Copies the attributes of the current element on the target writer(s). */
        protected void writeAttributes() {
                forEachTargetWriter { writeAttributes(it) }
        }

        /**
         * Copies the attributes of the current element on {@code swriter}, keeping the
         * attribute prefix when there is one.
         *
         * @param swriter the writer receiving the attributes
         */
        protected void writeAttributes(def swriter) {
                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                        String attrPrefix = parser.getAttributePrefix(i);
                        String attrName = parser.getAttributeLocalName(i);
                        if (attrPrefix != null && attrPrefix.length() > 0) {
                                attrName = attrPrefix+":"+attrName;
                        }
                        swriter.writeAttribute(attrName, parser.getAttributeValue(i));
                }
        }

        /** Forwards the current processing instruction to the target writer(s). */
        protected void processProcessingInstruction() {
                String target = parser.getPITarget()
                String data = parser.getPIData()
                forEachTargetWriter { it.writeProcessingInstruction(target, data) }
        }

        /** Forwards the current DTD to the target writer(s). */
        protected void processDTD()
        {
                String text = parser.getText()
                forEachTargetWriter { it.writeDTD(text) }
        }

        /** Forwards the current CDATA section to the target writer(s). */
        protected void processCDATA()
        {
                String text = parser.getText()
                forEachTargetWriter { it.writeCData(text) }
        }

        /** Forwards the current comment to the target writer(s). */
        protected void processComment()
        {
                String text = parser.getText()
                forEachTargetWriter { it.writeComment(text) }
        }

        /**
         * Looks up the writer registered under {@code key} and counts one more use of it.
         *
         * @param key date + speaker identifier
         * @return the registered writer, or null when no writer exists for {@code key}
         */
        protected getWriter(String key) {
                def w = writers.get(key)
                // only count real hits: an unknown key has no counter, and "null + 1" would
                // throw before the caller gets a chance to report the missing writer
                if (w != null) {
                        counts.put(key, (counts.get(key) ?: 0) + 1)
                }
                return w
        }

        /** Ends the document on the target writer(s). */
        protected void processEndDocument() {
                forEachTargetWriter { it.writeEndDocument() }
        }

        /** Forwards the current entity reference to the target writer(s). */
        protected void processEntityReference() {
                String name = parser.getLocalName()
                forEachTargetWriter { it.writeEntityRef(name) }
        }

        /**
         * Creates one output file per (date, speaker) pair, runs the split, closes all
         * writers and finally deletes the files that received no posting.
         *
         * @param dates the date values to create files for
         * @param locutors the speaker identifiers to create files for
         * @return the value returned by {@code process(writer)}
         */
        public boolean processDatesAndLocs(def dates, def locutors) {
                for (String date : dates) {
                        for (String loc : locutors) {
                                println "Create writer for $loc"
                                String key = date+loc
                                writers.put(key, new StaxStackWriter(new File(outputDirectory, "${loc}-${date}-teicmr.xml")))
                                counts.put(key, 0)
                        }
                }
                println "writers: $writers"

                boolean ret;
                try {
                        for (def swriter : writers.values()) {
                                swriter.writeStartDocument("UTF-8", "1.0");
                                swriter.writeCharacters("\n");
                        }

                        ret = process(writer);
                } finally {
                        // release every output file and the parser even when the processing failed
                        for (def swriter : writers.values()) {
                                try {swriter.close();} catch(Exception e){println "close writer exep: "+e}
                        }

                        if (parser != null)
                                try {parser.close()} catch(Exception e){println "parser exep: "+e}
                }

                //delete file (date+loc) not used
                for (def key : counts.keySet()) {
                        if (counts.get(key) == 0) { // never used for a couple date+loc
                                (writers.get(key)).getInfile().delete()
                        }
                }

                return ret;
        }

        /**
         * Command line entry point.
         *
         * @param args optional: args[0] is the input XML file, args[1] the output
         *             directory (emptied before the run); the historical hard-coded
         *             paths are used for missing arguments
         */
        public static void main(String[] args) {
                String inputPath = (args != null && args.length > 0) ? args[0] : "/home/mdecorde/xml/comere/ismael-textchat.xml"
                String outputPath = (args != null && args.length > 1) ? args[1] : "/home/mdecorde/xml/comere/split_out"

                File inputFile = new File(inputPath)
                File outputDirectory = new File(outputPath)
                outputDirectory.deleteDir()
                outputDirectory.mkdirs()

                XPathResult xpathProcessor = new XPathResult(inputFile);
                def dates = xpathProcessor.getXpathResponses("//tei:dateline/tei:date/text()")
                def locs = xpathProcessor.getXpathResponses("//tei:listPerson/tei:person/@xml:id")

                // keep each date only once
                dates = new HashSet(dates)
                dates = new ArrayList(dates)
                println dates.sort()
                println locs

                def p = new TEICMCPerLocPerDate(inputFile, outputDirectory)
                println p.processDatesAndLocs(dates, locs)
        }
}