// Source path: root/tmp/org.txm.groovy.core/src/groovy/org/txm/importer/xtz/XTZImport.groovy
// Revision: 966 (5.1 kB)

package org.txm.importer.xtz;

import java.io.File;
import java.io.FileFilter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;

import org.eclipse.core.runtime.IProgressMonitor;
import org.txm.*
import org.txm.metadatas.Metadatas
import org.txm.objects.BaseParameters
import org.txm.utils.io.FileCopy;
import org.txm.utils.xml.DomUtils;
import org.w3c.dom.Element

/**
 * Import module that wires together the XTZ import steps.
 *
 * {@link #init(BaseParameters)} installs an {@code XTZImporter}, an
 * {@code XTZCompiler}, a {@code TTAnnotater} and an {@code XTZPager};
 * {@link #getTXMFilesOrder()} orders the TXM files with the "textorder"
 * metadata; {@link #start()} runs the parent import and then copies the
 * "xsl", "css" and "dtd" source sub-directories into the binary directory.
 */
public class XTZImport extends ImportModule {

        /**
         * Builds the module from an import parameters file.
         *
         * @param importParametersFile the import parameters file, passed as is to the parent constructor
         */
        public XTZImport(File importParametersFile) {
                super(importParametersFile);
        }

        /**
         * Builds the module from already loaded import parameters.
         *
         * @param importParameters the import parameters, passed as is to the parent constructor
         */
        public XTZImport(BaseParameters importParameters) {
                super(importParameters);
        }

        /**
         * Runs the parent initialisation, then installs the four XTZ steps.
         *
         * @param p the import parameters forwarded to the parent {@code init}
         */
        @Override
        public void init(BaseParameters p) {
                super.init(p);

                importer = new XTZImporter(this)
                compiler = new XTZCompiler(this)
                annotater = new TTAnnotater(this);
                pager = new XTZPager(this)
        }

        /**
         * Lists the ".xml" files found directly in {@code <binaryDirectory>/txm/<corpusName>}
         * and sorts them with the "textorder" metadata value of each file.
         *
         * Files that have a "textorder" value come first, ordered by that value and
         * then by file name; files without one come last, ordered by file name.
         * The parent order is used instead when there is no importer step, no
         * metadata, or when the TXM directory cannot be listed.
         *
         * NOTE(review): "textorder" values are compared as strings, so "10" sorts
         * before "2" - confirm this is the intended ordering.
         *
         * @return the ordered TXM files
         */
        @Override
        protected ArrayList<File> getTXMFilesOrder() {
                if (importer == null) {
                        println "no importer step, using default text order"
                        return super.getTXMFilesOrder();
                }
                Metadatas metadata = importer.getMetadata();
                if (metadata == null) {
                        println "no metadata, using default text order"
                        return super.getTXMFilesOrder();
                }

                File txmDirectory = new File(binaryDirectory, "txm/"+corpusName);
                File[] xmlFiles = txmDirectory.listFiles(new FileFilter() {
                        @Override
                        public boolean accept(File file) {
                                return file.isFile() && file.getName().endsWith(".xml");
                        }
                });
                // File.listFiles returns null when the path is not a directory or cannot
                // be read; Arrays.asList(null) would then throw a NullPointerException.
                if (xmlFiles == null) {
                        println "cannot list TXM directory "+txmDirectory+", using default text order"
                        return super.getTXMFilesOrder();
                }
                ArrayList<File> files = new ArrayList<File>(Arrays.asList(xmlFiles));

                // "textorder" value of each file that declares one
                final HashMap<File, String> textorder = new HashMap<File, String>();
                for (File f : files) {
                        HashMap<String, String> m = metadata.getTextMetadata(f);
                        if (m != null && m.containsKey("textorder")) {
                                textorder.put(f, m.get("textorder"));
                        }
                }
                println "Sorting texts using 'textorder' metadata values: "+textorder

                Collections.sort(files, new Comparator<File>() {
                        @Override
                        public int compare(File f1, File f2) {
                                String o1 = textorder.get(f1);
                                String o2 = textorder.get(f2);
                                if (o1 != null && o2 != null) {
                                        int c = o1.compareTo(o2);
                                        if (c != 0) return c;
                                } else if (o1 != null) {
                                        return -1; // only f1 has a textorder: f1 first
                                } else if (o2 != null) {
                                        return 1; // only f2 has a textorder: f2 first
                                }
                                // same textorder, or none at all: fall back to the file name
                                return f1.getName().compareTo(f2.getName());
                        }
                });
                return files;
        }

        /**
         * Runs the parent import, then, if it succeeded and the corpus is not being
         * updated, copies the "xsl", "css" and "dtd" sub-directories that exist in
         * the source directory into the binary directory.
         *
         * @throws InterruptedException declared for the parent {@code start} call
         */
        @Override
        public void start() throws InterruptedException {
                super.start();

                if (!isSuccessful) return;

                //declare a local KR
                //TODO find out how the annotation plugin may hook the import steps
//                        List<String> krnames = importParameters.getKnowledgeRepositoryNames();
//                        if (krnames.size() == 0) {
//                                importParameters.createKnowledgeRepositoryElement("DEFAULT"); // set a default KR shared by all XTZ corpus
//                        } else if (krnames.size() == 1 && krnames.get(0).equals("DEFAULT")) {
//                                // nothing to do
//                        } else {
//                                println("Corpus is using custom Knowledge repositories: "+importParameters.getKnowledgeRepositoryNames());
//                        }

                // sub-directories are only copied for a new import
                if (isUpdatingCorpus()) return;

                def dirToCopy = ["xsl", "css", "dtd"]
                println "--- Copying subdirectories $dirToCopy"
                for (String dir : dirToCopy) {
                        File origDirectory = new File(this.sourceDirectory, dir)
                        if (origDirectory.exists()) {
                                print "." // one dot per copied sub-directory
                                File copyDirectory = new File(this.binaryDirectory, dir)
                                FileCopy.copyFiles(origDirectory, copyDirectory)
                        }
                }
                println ""
        }

        /**
         * Developer entry point: configures the Toolbox for a Linux install and runs
         * one import with cleaning, multithreading and corpus update disabled and
         * debug enabled, then prints the outcome and the elapsed milliseconds.
         *
         * @param args optional; {@code args[0]} is the path of the import parameters
         *             file. Without it the historical default
         *             "/home/mdecorde/xml/xtzsmall/import.xml" is used.
         */
        public static void main(String[] args) {

                long start = System.currentTimeMillis()

                String userDir = System.getProperty("user.home")
                Toolbox.setParam(Toolbox.INSTALL_DIR,new File("/usr/lib/TXM"));
                //Toolbox.setParam(Toolbox.INSTALL_DIR,new File("C:\\Program Files\\TXM"));//For Windows
                Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
                Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
                Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
                //Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH,new File("C:\\Program Files\\treetagger\\models"));//for Windows
                Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userDir, "TXM"));

                // the path may be given on the command line instead of editing this file
                String importParametersPath = "/home/mdecorde/xml/xtzsmall/import.xml"
                if (args != null && args.length > 0) {
                        importParametersPath = args[0]
                }
                File importParametersFile = new File(importParametersPath);
                BaseParameters b = new BaseParameters(importParametersFile);
                b.getKeyValueParameters().put(ImportKeys.CLEAN, "false")
                b.getKeyValueParameters().put(ImportKeys.MULTITHREAD, "false")
                b.getKeyValueParameters().put(ImportKeys.DEBUG, "true")
                b.getKeyValueParameters().put(ImportKeys.UPDATECORPUS, "false")

                XTZImport i = new XTZImport(b);
                i.start();

                println "Done: "+i.isSuccessful

                long end = System.currentTimeMillis()
                println "TIME: "+(end-start)
        }
}