Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / xtz / XTZImport.groovy @ 148

History | View | Annotate | Download (5.1 kB)

1
package org.txm.importer.xtz;
2

    
3
import org.txm.*
4
import org.txm.metadatas.Metadatas
5
import org.txm.objects.BaseParameters
6
import org.txm.utils.FileCopy
7

    
8
/**
 * Import module for XTZ sources.
 *
 * On {@link #init(BaseParameters)} the module installs its four steps:
 * an {@code XTZImporter}, an {@code XTZCompiler}, a {@code TTAnnotater}
 * and an {@code XTZPager}. It also overrides the order in which the TXM
 * files are processed so that the optional 'textorder' metadata is honoured.
 */
public class XTZImport extends ImportModule {

	/**
	 * Builds the module from an import parameters file.
	 *
	 * @param importParametersFile the import parameters file, handed to the super constructor
	 */
	public XTZImport(File importParametersFile) {
		super(importParametersFile);
	}

	/**
	 * Builds the module from already loaded import parameters.
	 *
	 * @param importParameters the import parameters, handed to the super constructor
	 */
	public XTZImport(BaseParameters importParameters) {
		super(importParameters);
	}

	/**
	 * Runs the parent initialisation, then installs the XTZ specific steps.
	 *
	 * @param p the import parameters, forwarded to the parent implementation
	 */
	@Override
	public void init(BaseParameters p) {
		super.init(p);

		importer = new XTZImporter(this)
		compiler = new XTZCompiler(this)
		annotater = new TTAnnotater(this);
		pager = new XTZPager(this)
	}

	/**
	 * Returns the "*.xml" files of the "txm/&lt;corpusName&gt;" binary sub-directory,
	 * sorted with the 'textorder' metadata of each text.
	 *
	 * Files that have a 'textorder' value come first, ordered by that value;
	 * files without one come last. Ties (and files without value) are ordered
	 * by file name.
	 *
	 * The parent order is used instead when there is no importer step, when the
	 * importer has no metadata, or when the TXM directory cannot be listed.
	 *
	 * @return the TXM files in processing order
	 */
	@Override
	protected ArrayList<File> getTXMFilesOrder() {
		if (importer == null) {
			println "no importer step, using default text order"
			return super.getTXMFilesOrder();
		}

		Metadatas metadata = importer.getMetadata();
		if (metadata == null) {
			println "no metadata, using default text order"
			return super.getTXMFilesOrder();
		}

		File txmDirectory = new File(binaryDirectory, "txm/"+corpusName);
		File[] xmlFiles = txmDirectory.listFiles(new FileFilter() {
			@Override
			public boolean accept(File file) {
				return file.isFile() && file.getName().endsWith(".xml");
			}
		});
		if (xmlFiles == null) {
			// File.listFiles() returns null (not an empty array) when the path is not
			// a directory or cannot be read; Arrays.asList(null) would then throw.
			println "cannot list TXM files of "+txmDirectory+", using default text order"
			return super.getTXMFilesOrder();
		}
		ArrayList<File> files = new ArrayList<File>(Arrays.asList(xmlFiles));

		// only the files that declare a 'textorder' metadata get an entry
		final HashMap<File, String> textorder = new HashMap<File, String>();
		for (File f : files) {
			HashMap<String, String> m = metadata.getTextMetadata(f);
			if (m != null && m.containsKey("textorder")) {
				textorder.put(f, m.get("textorder"));
			}
		}
		println "Sorting texts using 'textorder' metadata values: "+textorder

		Collections.sort(files, new Comparator<File>() {
			@Override
			public int compare(File f1, File f2) {
				String o1 = textorder.get(f1);
				String o2 = textorder.get(f2);

				if (o1 == null && o2 == null) {
					return f1.getName().compareTo(f2.getName());
				}
				if (o1 == null) return 1;  // f1 has no order: goes after f2
				if (o2 == null) return -1; // f2 has no order: goes after f1

				// NOTE(review): values are compared as strings, so "10" sorts before "2";
				// confirm 'textorder' values are meant to be lexicographically ordered.
				int c = o1.compareTo(o2);
				return (c != 0) ? c : f1.getName().compareTo(f2.getName());
			}
		});
		return files;
	}

	/**
	 * Runs the parent import, then, if it succeeded, declares the default
	 * knowledge repository when none is set and, for a new import (not a corpus
	 * update), copies the "xsl", "css" and "dtd" source sub-directories that
	 * exist into the binary directory.
	 *
	 * @throws InterruptedException declared by the overridden start()
	 */
	public void start() throws InterruptedException {
		super.start();

		if (!isSuccessful) {
			return;
		}

		// declare a local KR
		List<String> krnames = importParameters.getKnowledgeRepositoryNames();
		if (krnames.size() == 0) {
			importParameters.createKnowledgeRepositoryElement("DEFAULT"); // set a default KR shared by all XTZ corpus
		} else if (!(krnames.size() == 1 && "DEFAULT".equals(krnames.get(0)))) {
			println("Corpus is using custom Knowledge repositories: "+krnames);
		} // else: the default KR is already the only one declared, nothing to do

		// copy sub directories, only done for a new import
		if (isUpdatingCorpus()) {
			return;
		}

		def dirToCopy = ["xsl", "css", "dtd"]
		println "--- Copying subdirectories $dirToCopy"
		for (String dir : dirToCopy) {
			File origDirectory = new File(this.sourceDirectory, dir)
			if (origDirectory.exists()) {
				print "." // one progress dot per copied directory
				File copyDirectory = new File(this.binaryDirectory, dir)
				FileCopy.copyFiles(origDirectory, copyDirectory)
			}
		}
		println ""
	}

	/**
	 * Developer entry point: configures the Toolbox with Linux default paths,
	 * runs one XTZ import and prints its result and duration in milliseconds.
	 *
	 * @param args optional; when present, args[0] is the path of the import
	 *             parameters file to use instead of the built-in default path
	 */
	public static void main(String[] args) {

		long start = System.currentTimeMillis()

		String userDir = System.getProperty("user.home")
		// For Windows use "C:\\Program Files\\TXM", "C:\\Program Files\\treetagger"
		// and "C:\\Program Files\\treetagger\\models" for the three paths below.
		Toolbox.setParam(Toolbox.INSTALL_DIR, new File("/usr/lib/TXM"));
		Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH, new File(userDir, "treetagger"));
		Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH, new File(userDir, "treetagger/models"));
		Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
		Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
		Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
		Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userDir, "TXM"));

		// the first argument, when given, replaces the default import parameters file
		File importParametersFile
		if (args != null && args.length > 0) {
			importParametersFile = new File(args[0]);
		} else {
			importParametersFile = new File("/home/mdecorde/xml/xtzsmall/import.xml");
		}

		BaseParameters b = new BaseParameters(importParametersFile);
		b.getKeyValueParameters().put(ImportKeys.CLEAN, "false")
		b.getKeyValueParameters().put(ImportKeys.MULTITHREAD, "false")
		b.getKeyValueParameters().put(ImportKeys.DEBUG, "true")
		b.getKeyValueParameters().put(ImportKeys.UPDATECORPUS, "false")

		XTZImport i = new XTZImport(b);
		i.start();

		println "Done: "+i.isSuccessful

		long end = System.currentTimeMillis()
		println "TIME: "+(end-start)
	}
}