root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / xtz / XTZImport.groovy @ 715
History | View | Annotate | Download (5.4 kB)
1 | 321 | mdecorde | package org.txm.importer.xtz;
|
---|---|---|---|
2 | 321 | mdecorde | |
3 | 479 | mdecorde | import java.io.File; |
4 | 479 | mdecorde | import java.io.FileFilter; |
5 | 479 | mdecorde | import java.util.ArrayList; |
6 | 479 | mdecorde | import java.util.Arrays; |
7 | 479 | mdecorde | import java.util.Collections; |
8 | 479 | mdecorde | |
9 | 490 | mdecorde | import org.eclipse.core.runtime.IProgressMonitor; |
10 | 479 | mdecorde | import org.txm.utils.xml.DomUtils; |
11 | 321 | mdecorde | import org.txm.metadatas.Metadatas |
12 | 321 | mdecorde | import org.txm.objects.BaseParameters |
13 | 321 | mdecorde | import org.txm.utils.io.FileCopy; |
14 | 479 | mdecorde | import org.txm.* |
15 | 479 | mdecorde | import org.w3c.dom.Element |
16 | 321 | mdecorde | |
/**
 * Import module of the XTZ import.
 *
 * Wires the four import steps (XTZImporter, XTZCompiler, TTAnnotater, XTZPager),
 * orders the TXM files with the optional 'textorder' text metadata and, once a new
 * corpus has been built, copies the "xsl", "css" and "dtd" source sub-directories
 * next to the binary corpus.
 */
public class XTZImport extends ImportModule {

	/**
	 * Builds the module from an import parameters file.
	 *
	 * @param importParametersFile the import parameters file, handed to the parent constructor
	 */
	public XTZImport(File importParametersFile) {
		super(importParametersFile);
	}

	/**
	 * Builds the module from already loaded import parameters.
	 *
	 * @param importParameters the import parameters, handed to the parent constructor
	 */
	public XTZImport(BaseParameters importParameters) {
		super(importParameters);
	}

	/**
	 * Runs the parent initialisation, then installs the XTZ specific steps.
	 *
	 * @param p the import parameters
	 */
	@Override
	public void init(BaseParameters p) {
		super.init(p);

		importer = new XTZImporter(this)
		compiler = new XTZCompiler(this)
		annotater = new TTAnnotater(this);
		pager = new XTZPager(this)
	}

	/**
	 * Lists the "*.xml" files of the "txm/<corpusName>" binary directory, sorted with
	 * the 'textorder' metadata of each text.
	 *
	 * Files with a 'textorder' value come first, ordered by that value; files without
	 * one come last; ties (and files without value) are ordered by file name.
	 * The parent order is used when there is no importer step, no metadata, or when
	 * the TXM directory cannot be listed.
	 *
	 * @return the ordered TXM files
	 */
	@Override
	protected ArrayList<File> getTXMFilesOrder() {
		if (importer == null) {
			println "no importer step, using default text order"
			return super.getTXMFilesOrder();
		}

		Metadatas metadata = importer.getMetadata();
		if (metadata == null) {
			println "no metadata, using default text order"
			return super.getTXMFilesOrder();
		}

		File txmDirectory = new File(binaryDirectory, "txm/"+corpusName);
		File[] xmlFiles = txmDirectory.listFiles(new FileFilter() {
					@Override
					public boolean accept(File file) {
						return file.isFile() && file.getName().endsWith(".xml");
					}
				});
		// File.listFiles() returns null (not an empty array) when the path is not a
		// readable directory: without this guard Arrays.asList(null) throws a NPE.
		if (xmlFiles == null) {
			println "cannot list TXM files of "+txmDirectory+", using default text order"
			return super.getTXMFilesOrder();
		}
		ArrayList<File> files = new ArrayList<File>(Arrays.asList(xmlFiles));

		// collect the 'textorder' value of every file that declares one
		final HashMap<File, String> textorder = new HashMap<File, String>();
		for (File f : files) {
			HashMap<String, String> m = metadata.getTextMetadata(f);
			if (m != null && m.containsKey("textorder")) {
				textorder.put(f, m.get("textorder"));
			}
		}
		println "Sorting texts using 'textorder' metadata values: "+textorder

		Collections.sort(files, new Comparator<File>() {
					@Override
					public int compare(File f1, File f2) {
						String o1 = textorder.get(f1);
						String o2 = textorder.get(f2);

						// files without 'textorder' are pushed after the ones with a value
						if (o1 == null && o2 != null) return 1;
						if (o1 != null && o2 == null) return -1;

						// NOTE(review): values are compared as strings, so "10" sorts
						// before "2" - confirm this is the intended ordering.
						int c = (o1 == null) ? 0 : o1.compareTo(o2);

						// no value on both sides, or equal values: fall back to the file name
						return (c != 0) ? c : f1.getName().compareTo(f2.getName());
					}
				});

		return files;
	}

	/**
	 * Runs the parent import, then, if it succeeded and the corpus is not being
	 * updated, copies the "xsl", "css" and "dtd" sub-directories that exist in the
	 * source directory to the binary directory.
	 *
	 * @throws InterruptedException declared by the original signature; propagated from the parent start()
	 */
	@Override
	public void start() throws InterruptedException {
		super.start();

		if (!isSuccessful) {
			return;
		}

		//declare a local KR
		//TODO find out how the annotation plugin may hook the import steps
		//			List<String> krnames = importParameters.getKnowledgeRepositoryNames();
		//			if (krnames.size() == 0) {
		//				importParameters.createKnowledgeRepositoryElement("DEFAULT"); // set a default KR shared by all XTZ corpus
		//			} else if (krnames.size() == 1 && krnames.get(0).equals("DEFAULT")) {
		//				// nothing to do
		//			} else {
		//				println("Corpus is using custom Knowledge repositories: "+importParameters.getKnowledgeRepositoryNames());
		//			}

		//copy sub directories, only done when building a new corpus
		if (!isUpdatingCorpus()) {
			def dirToCopy = ["xsl", "css", "dtd"]
			println "--- Copying subdirectories $dirToCopy"
			for (String dir : dirToCopy) {
				File origDirectory = new File(this.sourceDirectory, dir)
				if (origDirectory.exists()) {
					print "." // one dot per copied directory
					File copyDirectory = new File(this.binaryDirectory, dir)
					FileCopy.copyFiles(origDirectory, copyDirectory)
				}
			}
			println ""
		}
	}

	/**
	 * Developer entry point: configures the Toolbox with hard-coded local paths, runs
	 * the import described by a hard-coded import.xml file and prints the result and
	 * the elapsed time in milliseconds.
	 *
	 * @param args not used
	 */
	public static void main(String[] args) {

		long start = System.currentTimeMillis()

		String userHome = System.getProperty("user.home")
		Toolbox.setParam(Toolbox.INSTALL_DIR,new File("/usr/lib/TXM"));
		//Toolbox.setParam(Toolbox.INSTALL_DIR,new File("C:\\Program Files\\TXM"));//For Windows
		Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH,new File(userHome,"treetagger"));
		//Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH,new File("C:\\Program Files\\treetagger"));//for Windows
		Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH,new File(userHome,"treetagger/models"));
		Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
		Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
		Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
		//Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH,new File("C:\\Program Files\\treetagger\\models"));//for Windows
		Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(userHome, "TXM"));

		//File importParametersFile = new File("/home/mdecorde/xml/brown-for-xtz/import.xml");
		//File importParametersFile = new File("/home/mdecorde/xml/qgraalc/qgraal_cw/import.xml"); // new import
		//File importParametersFile = new File("/home/mdecorde/TXM/corpora/QGRAALXTZ/import.xml"); // corpus update
		//File importParametersFile = new File("/home/mdecorde/xml/annotation/import.xml"); // new import
		//File importParametersFile = new File("/home/mdecorde/xml/baiptest/import.xml"); // new import
		//File importParametersFile = new File("/home/mdecorde/TXM/corpora/BAIP/import.xml"); // corpus update
		File importParametersFile = new File("/home/mdecorde/xml/xtzsmall/import.xml");
		BaseParameters b = new BaseParameters(importParametersFile);
		b.getKeyValueParameters().put(ImportKeys.CLEAN, "false")
		b.getKeyValueParameters().put(ImportKeys.MULTITHREAD, "false")
		b.getKeyValueParameters().put(ImportKeys.DEBUG, "true")
		b.getKeyValueParameters().put(ImportKeys.UPDATECORPUS, "false")

		XTZImport i = new XTZImport(b);
		i.start();

		println "Done: "+i.isSuccessful

		long end = System.currentTimeMillis()
		println "TIME: "+(end-start)
	}
}