Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / bfm / importer.groovy @ 479

History | View | Annotate | Download (9.4 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2016-05-11 08:38:47 +0200 (mer. 11 mai 2016) $
25
// $LastChangedRevision: 3211 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.bfm;
29

    
30
import org.txm.importer.*;
31
import org.txm.importer.filters.*;
32
import org.txm.scripts.*;
33
import org.txm.scripts.teitxm.*;
34
import org.txm.utils.io.FileCopy;
35
import org.txm.utils.treetagger.TreeTagger;
36

    
37
import javax.xml.stream.*;
38

    
39
import java.io.File;
40
import java.net.URL;
41
import java.util.Properties;
42

    
43
import org.txm.*;
44

    
45
import filters.CutHeader.*;
46
import filters.ReunitBrokenWords.*;
47
import filters.Tokeniser.*;
48
import filters.TagSentences.*;
49
import filters.FusionHeader.*;
50

    
51
// TODO: Auto-generated Javadoc
52
/**
 * Convert TEI-BFM to TEI-TXM: <br/>
 * 1- inject the metadata read with XPath queries in each source file.<br/>
 * 2- tokenize : manage BFM's &lt;choice> policy (see <a
 * href="http://bfm.ens-lsh.fr">BFM website</a>).<br/>
 * 3- build the XML-TEI-TXM files with Xml2Ana, with no annotation to import. <br/>
 *
 * @author mdecorde
 */
class importer {

	/**
	 * Runs the whole import chain on the XML files found in txmDir.
	 *
	 * @param srcDir not used by this method
	 * @param binDir working directory; its "ptokenized", "tokenized" and "headers"
	 *               sub-directories are deleted and created again
	 * @param txmDir directory containing the XML files to process; the files are rewritten in place
	 * @param basename not used by this method
	 * @param metadataXPath metadata name -> XPath query evaluated on each file, may be null
	 * @return true, if successful
	 */
	public boolean run(File srcDir, File binDir, File txmDir, String basename, Properties metadataXPath)
	{
		// start from empty working directories
		for (String dirname : ["ptokenized", "tokenized", "headers"]) {
			File workDir = new File(binDir, dirname);
			workDir.deleteDir();
			workDir.mkdir();
		}

		List<String> milestones = ["lb", "pb", "milestone"];

		List<File> files = txmDir.listFiles();
		if (files == null || files.size() == 0) return false;

		// only the XML files are concerned by the metadata injection, import.xml excepted
		files = files.findAll { File f -> f.getName().endsWith(".xml") && !f.getName().equals("import.xml") };

		for (File f : files) {
			if (!injectMetadata(f, txmDir, metadataXPath)) return false;
		}

		// NOTE(review): this second listing is not filtered: every regular file of txmDir
		// (import.xml and non-XML files included) is given to OneTagPerLine - confirm this is intended
		files = txmDir.listFiles();
		if (files == null || files.size() == 0) return false;

		//PREPARE EACH FILE TO BE TOKENIZED
		println("preparing "+files.size()+" files for the tokenizer");
		for (File f : files) {
			if (f.isDirectory()) continue;
			print "."
			File resultfile = new File(binDir,"ptokenized/"+f.getName());

			def builder2 = new OneTagPerLine(f.toURI().toURL(), milestones);
			if (!builder2.process(resultfile)) {
				// the file is skipped, the other files are still processed
				println "Failed to tokenize file: "+f;
				resultfile.delete();
			}
		}
		println ""

		//TOKENIZE FILES
		File ptokenizedDir = new File(binDir, "ptokenized");
		files = ptokenizedDir.listFiles();
		if (files == null) {
			println "ERROR could not list the files of $ptokenizedDir"
			return false;
		}
		println("Tokenizing "+files.size()+" files")
		for (File infile : files) {
			print "."
			if (!tokenize(infile, binDir)) return false;
		}
		println ""

		//VALIDATE THE TOKENIZER RESULTS
		File tokenizedDir = new File(binDir, "tokenized");
		files = tokenizedDir.listFiles();
		if (files == null) {
			println "ERROR could not list the files of $tokenizedDir"
			return false;
		}
		println "Validating XML of "+files.size()+" files"
		for (File f : files) {
			print "."
			if (!ValidateXml.test(f)) {
				// the invalid file is moved to "tokenizerError" and is not processed further
				System.out.println("XML NOT VALID "+f);
				new File(binDir, "tokenizerError").mkdir();
				FileCopy.copy(f,new File(binDir, "tokenizerError/"+f.getName()));
				f.delete();
			}
		}
		println("");

		//TRANSFORM INTO XML-TEI-TXM
		files = tokenizedDir.listFiles();
		if (files == null) {
			println "ERROR could not list the files of $tokenizedDir"
			return false;
		}
		println("Building xml-tei-txm "+files.size()+ " files")
		if (files.size() == 0) return false;
		for (File file : files) {
			print "."
			if (!buildTeiTxm(file, txmDir)) return false;
		}
		println("")
		return true;
	}

	/**
	 * Reads the metadata of a file with the XPath queries and injects them as
	 * attributes of its "text" element. The file is replaced by the result.
	 *
	 * @param f the XML file to update, its name must end with ".xml"
	 * @param txmDir the directory where the temporary "copy" file is written
	 * @param metadataXPath metadata name -> XPath query, may be null
	 * @return true, if the file has been replaced by its updated version
	 */
	private boolean injectMetadata(File f, File txmDir, Properties metadataXPath) {
		HashMap<String, String> metadataValues = new HashMap<String, String>();
		if (metadataXPath != null) {
			def xpathprocessor = new XPathResult(f)
			try {
				for (String name : metadataXPath.keySet()) {
					// "N/A" is passed as the fallback value of the query
					String value = xpathprocessor.getXpathResponse(metadataXPath.get(name), "N/A");
					value = value.trim().replace("\n", " ");
					metadataValues.put(name, value);
				}
			} finally {
				// release the processor even if a query failed
				xpathprocessor.close();
			}
		}

		// NOTE(review): the previous code computed a default "prose" value for "forme"
		// but never stored it; no default is injected when "forme" is missing - confirm
		if (metadataValues.containsKey("forme")) {
			String tmp = metadataValues.get("forme");
			if (tmp.startsWith("#forme_"))
				tmp = tmp.substring(7) // remove the "#forme_" prefix
			metadataValues.put("forme", tmp)
		}

		if (metadataValues.containsKey("morphosynt")) {
			// any value other than the "N/A" fallback is stored as "vérifié"
			if (metadataValues.get("morphosynt").equals("N/A"))
				metadataValues.put("morphosynt","non vérifié");
			else
				metadataValues.put("morphosynt","vérifié");
		}

		// "sigle" is always set: its own value, else "idbfm", else the file name without ".xml"
		String txtname = f.getName().substring(0, f.getName().length()-4);
		String titreId = txtname;
		if (metadataValues.containsKey("sigle")) {
			titreId = metadataValues.get("sigle");
		} else if (metadataValues.containsKey("idbfm")) {
			titreId = metadataValues.get("idbfm");
		}
		metadataValues.put("sigle", titreId);

		// inject attributes in $f
		AddAttributeInXml builder = new AddAttributeInXml(f, "text", metadataValues);
		builder.onlyOneElement();
		File outfile = new File(txmDir, "copy"+f.getName())

		if (!builder.process(outfile)) {
			println "Error while injecting metadata in $f"
			return false;
		}

		// replace the original file with the updated copy
		if (!f.delete()) {
			println "ERROR could not delete $f for metadatainjection"
			return false
		}
		if (!outfile.renameTo(f)) {
			// without this check the source file would be lost silently
			println "ERROR could not rename $outfile to $f for metadatainjection"
			return false
		}
		return true
	}

	/**
	 * Tokenizes one file of the "ptokenized" directory. The result is written
	 * in the "tokenized" directory of binDir with the same file name.
	 *
	 * @param infile the file to tokenize
	 * @param binDir the working directory containing "tokenized" and "headers"
	 * @return false if an exception was raised by the filters
	 */
	private boolean tokenize(File infile, File binDir) {
		try {
			Sequence S = new Sequence();
			Filter F1 = new CutHeader();
			Filter F6 = new Tokeniser(infile);
			Filter F11 = new FusionHeader();
			S.add(F1);
			S.add(F6);
			S.add(F11);

			File xmlfile = new File(binDir,"tokenized/"+infile.getName());

			S.SetInFileAndOutFile(infile.getAbsolutePath(), xmlfile.getAbsolutePath());
			S.setEncodages("UTF-8","UTF-8");

			// the same header file path is given to CutHeader and to FusionHeader
			String headerPath = new File(binDir,"/headers/"+infile.getName()+"header.xml").getAbsolutePath();
			Object[] arguments1 = [headerPath];
			F1.SetUsedParam(arguments1);
			Object[] arguments2 = [headerPath, F1];
			F11.SetUsedParam(arguments2);

			S.proceed();
			S.clean();
		} catch(Exception e) {
			org.txm.utils.logger.Log.printStackTrace(e);
			return false;
		}
		return true;
	}

	/**
	 * Builds the XML-TEI-TXM version of a tokenized file. The result is written
	 * in txmDir with the same file name.
	 *
	 * @param file the tokenized file
	 * @param txmDir the directory of the result file
	 * @return true, if Xml2Ana succeeded
	 */
	private boolean buildTeiTxm(File file, File txmDir) {
		String txmfile = file.getName();

		// all the tables below are given empty to Xml2Ana
		def correspType = new HashMap<String,String>()
		def correspRef = new HashMap<String,String>()

		// the ids of all the respStmt must be listed
		def respId = [];
		// maps a respId to the execution report of the tool
		def applications = new HashMap<String,HashMap<String,String>>();
		// maps a respId to the type attributes of the ana property of the txm w,
		// to build the references to the taxonomies
		def taxonomiesUtilisees = new HashMap<String,String[]>();
		// maps an item id to its description and its URI
		def itemsURI = new HashMap<String,HashMap<String,String>>();
		// respStmt informations
		// resps (respId <see above>, [description, person, date])
		def resps = new HashMap<String,String[]>();

		// run the process
		def builder3 = new Xml2Ana(file);
		builder3.setCorrespondances(correspRef, correspType);
		builder3.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
		if (!builder3.process(new File(txmDir,txmfile))) {
			println "Failed to build xml-txm file from file: "+file;
			return false;
		}
		return true;
	}

	/**
	 * for test purpose: tokenizes one hard-coded file and prints the elapsed time.
	 * The FusionHeader filter is not part of this test sequence.
	 *
	 * @param args the arguments, not used
	 */
	public static void main(String[] args) {
		long start = System.currentTimeMillis();
		File dir = new File("/home/mdecorde/xml/qgraal")
		File srcfile = new File(dir, "qgraal_cm-detoken.xml");
		File resultfile = new File(dir,"QGRAALFRO-pt1.xml");
		File xmlfile = new File(dir,"QGRAALFRO-t1.xml");

		def builder2 = new OneTagPerLine(srcfile.toURI().toURL(), []);
		if (!builder2.process(resultfile)) {
			println "Failed to tokenize file: "+srcfile;
			resultfile.delete();
			// nothing left to tokenize
			return;
		}

		try {
			Sequence S = new Sequence();
			Filter F1 = new CutHeader();
			Filter F6 = new Tokeniser(resultfile);
			S.add(F1);
			S.add(F6);

			S.SetInFileAndOutFile(resultfile.getAbsolutePath(), xmlfile.getAbsolutePath());
			S.setEncodages("UTF-8","UTF-8");

			Object[] arguments1 = [new File(dir,"header.xml").getAbsolutePath()];
			F1.SetUsedParam(arguments1);

			S.proceed();
			S.clean();
		} catch(Exception e) {
			org.txm.utils.logger.Log.printStackTrace(e);
		}
		println "time: "+(System.currentTimeMillis()-start)/1000
	}
}