Révision 2996
tmp/org.txm.searchengine.cqp.core/src/org/txm/searchengine/cqp/corpus/MainCorpus.java (revision 2996) | ||
---|---|---|
38 | 38 |
import java.util.Set; |
39 | 39 |
import java.util.UUID; |
40 | 40 |
|
41 |
import org.apache.commons.lang.StringUtils; |
|
41 | 42 |
import org.eclipse.core.runtime.IProgressMonitor; |
42 | 43 |
import org.eclipse.osgi.util.NLS; |
43 | 44 |
import org.txm.Toolbox; |
44 | 45 |
import org.txm.core.messages.TXMCoreMessages; |
45 | 46 |
import org.txm.core.results.TXMParameters; |
46 | 47 |
import org.txm.importer.cwb.PatchCwbRegistry; |
48 |
import org.txm.importer.cwb.ReadRegistryFile; |
|
47 | 49 |
import org.txm.objects.CorpusBuild; |
48 | 50 |
import org.txm.objects.Project; |
49 | 51 |
import org.txm.searchengine.core.messages.SearchEngineCoreMessages; |
... | ... | |
109 | 111 |
* |
110 | 112 |
* @param parametersNodePath the preference node path |
111 | 113 |
* |
112 |
* @throws InvalidCqpIdException the invalid cqp id exception
|
|
114 |
* @throws InvalidCqpIdException the invalid CQP id exception
|
|
113 | 115 |
* @throws CqiClientException the cqi client exception |
114 | 116 |
*/ |
115 | 117 |
public MainCorpus(String parametersNodePath) throws InvalidCqpIdException, CqiClientException { |
... | ... | |
117 | 119 |
|
118 | 120 |
if (getID() != null && getProjectDirectory() != null) { |
119 | 121 |
try { |
120 |
compute(false); |
|
122 |
if (!compute(false)) { |
|
123 |
Log.warning(NLS.bind("Warning: the {0} corpus won't work correctly", this.getID())); |
|
124 |
throw new IllegalStateException(NLS.bind("{0} CQP MainCorpus not instanciate correctly.", this.getID())); |
|
125 |
} |
|
121 | 126 |
} |
122 | 127 |
catch (InterruptedException e) { |
123 |
// TODO Auto-generated catch block |
|
124 |
e.printStackTrace(); |
|
128 |
throw new IllegalStateException(e); |
|
125 | 129 |
} |
126 | 130 |
} |
127 | 131 |
} |
... | ... | |
161 | 165 |
return false; |
162 | 166 |
} |
163 | 167 |
try { |
164 |
try { |
|
168 |
try { // fix the absolute paths in the registry file
|
|
165 | 169 |
PatchCwbRegistry.patch(this.registryFile, this.dataDirectory); |
166 | 170 |
} |
167 | 171 |
catch (IOException e) { |
... | ... | |
170 | 174 |
return false; |
171 | 175 |
} |
172 | 176 |
|
177 |
// check if all corpus index files are present |
|
178 |
ReadRegistryFile rrf = new ReadRegistryFile(this.registryFile); |
|
179 |
ArrayList<String> errors = rrf.isCorpusBuildValid(this.dataDirectory); |
|
180 |
if (errors.size() > 0) { |
|
181 |
Log.warning(TXMCoreMessages.bind("Error: some {0} index files are missing : {1}.", this.getID(), StringUtils.join(errors, ", "))); |
|
182 |
return false; |
|
183 |
} |
|
184 |
|
|
173 | 185 |
Log.fine(NLS.bind("Call CQI: load_a_system_corpus with {0} and {1}", this.registryFile.getParent(), this.pID)); |
174 | 186 |
CQPSearchEngine.getCqiClient().load_a_system_corpus(this.registryFile.getParent(), this.pID); |
175 | 187 |
|
176 |
List tmp = Arrays.asList(CQPSearchEngine.getCqiClient().listCorpora()); |
|
188 |
List<String> tmp = Arrays.asList(CQPSearchEngine.getCqiClient().listCorpora());
|
|
177 | 189 |
if (tmp.contains(this.pID)) { |
178 | 190 |
Log.fine("Corpus registered: " + pID); |
179 | 191 |
Log.fine(NLS.bind("Call CQI: corpusProperties with {0}.", this.pID)); |
tmp/org.txm.searchengine.cqp.core/src/org/txm/importer/cwb/ReadRegistryFile.java (revision 2996) | ||
---|---|---|
6 | 6 |
import java.util.HashSet; |
7 | 7 |
|
8 | 8 |
import org.txm.utils.io.IOUtils; |
9 |
import org.txm.utils.logger.Log; |
|
9 | 10 |
|
10 | 11 |
/** |
11 | 12 |
* Read a registry file and retrieve the declared p-attribute and s-attribute information. |
... | ... | |
13 | 14 |
* Call constructor then use : getPattributes and getSattributes for cwb-encode |
14 | 15 |
* |
15 | 16 |
* or use getSattributesMap, getSattributeProfs and getAnatypes() to get the declared attributes |
17 |
* |
|
16 | 18 |
* @author mdecorde |
17 | 19 |
* |
18 | 20 |
*/ |
19 | 21 |
public class ReadRegistryFile { |
22 |
|
|
20 | 23 |
File registryFile; |
24 |
|
|
21 | 25 |
ArrayList<String> pAttributes; |
26 |
|
|
22 | 27 |
ArrayList<String> sAttributes; |
28 |
|
|
23 | 29 |
HashMap<String, HashSet<String>> sattrs; |
30 |
|
|
24 | 31 |
HashMap<String, Integer> sattrsProfs; |
25 | 32 |
|
26 | 33 |
public ReadRegistryFile(File registryFile) { |
... | ... | |
34 | 41 |
public void read() { |
35 | 42 |
pAttributes = new ArrayList(); |
36 | 43 |
sAttributes = new ArrayList(); |
37 |
sattrs = new HashMap<String, HashSet<String>>();
|
|
38 |
sattrsProfs = new HashMap<String, Integer>();
|
|
44 |
sattrs = new HashMap<>(); |
|
45 |
sattrsProfs = new HashMap<>(); |
|
39 | 46 |
|
40 | 47 |
for (String line : IOUtils.getLines(registryFile, System.getProperty("file.encoding"))) { |
41 | 48 |
line = line.trim(); // remove first tab |
42 |
|
|
49 |
|
|
43 | 50 |
if (line.startsWith("ATTRIBUTE ")) { |
44 | 51 |
line = line.substring(10); // remove 'ATTRIBUTE ' |
45 | 52 |
pAttributes.add(line); |
46 |
} else if (line.startsWith("STRUCTURE ")) { |
|
53 |
} |
|
54 |
else if (line.startsWith("STRUCTURE ")) { |
|
47 | 55 |
line = line.substring(9); // remove 'STRUCTURE ' |
48 | 56 |
line = line.replaceAll("\\#.*", ""); |
49 | 57 |
line = line.trim(); |
50 | 58 |
String[] split = line.split("_", 2); |
51 | 59 |
String sname = split[0]; |
52 |
//println split |
|
60 |
// println split
|
|
53 | 61 |
if (split.length == 1) { // sattr decl |
54 |
if (sname.matches(".+[1-9]") && sattrs.containsKey(sname.substring(0, sname.length()-1))) { // recursive structure |
|
55 |
sname = sname.substring(0, sname.length()-1); |
|
56 |
sattrsProfs.put(sname, sattrsProfs.get(sname)+1); |
|
57 |
} else { |
|
62 |
if (sname.matches(".+[1-9]") && sattrs.containsKey(sname.substring(0, sname.length() - 1))) { // recursive structure |
|
63 |
sname = sname.substring(0, sname.length() - 1); |
|
64 |
sattrsProfs.put(sname, sattrsProfs.get(sname) + 1); |
|
65 |
} |
|
66 |
else { |
|
58 | 67 |
sattrs.put(sname, new HashSet<String>()); |
59 | 68 |
sattrsProfs.put(sname, 0); |
60 | 69 |
} |
61 |
} else { |
|
70 |
} |
|
71 |
else { |
|
62 | 72 |
String satt = split[1]; |
63 |
if (satt.matches(".+[1-9]") && sattrs.get(sname).contains(satt.substring(0, satt.length()-1))) {
|
|
73 |
if (satt.matches(".+[1-9]") && sattrs.get(sname).contains(satt.substring(0, satt.length() - 1))) {
|
|
64 | 74 |
// recursive attribute -> to be ignored |
65 |
} else { |
|
75 |
} |
|
76 |
else { |
|
66 | 77 |
sattrs.get(sname).add(satt); |
67 | 78 |
} |
68 | 79 |
} |
... | ... | |
70 | 81 |
} |
71 | 82 |
|
72 | 83 |
for (String sattr : sattrs.keySet()) { |
73 |
String tmp = ""+sattr+":"+sattrsProfs.get(sattr);
|
|
84 |
String tmp = "" + sattr + ":" + sattrsProfs.get(sattr);
|
|
74 | 85 |
for (String attr : sattrs.get(sattr)) { |
75 |
tmp += "+"+attr;
|
|
86 |
tmp += "+" + attr;
|
|
76 | 87 |
} |
77 | 88 |
sAttributes.add(tmp); |
78 | 89 |
} |
79 | 90 |
} |
80 | 91 |
|
81 | 92 |
/** |
93 |
* Test the CQP index files of each p-attribute and s-attribute properties |
|
82 | 94 |
* |
95 |
* @param dataDirectory the directory where the binary files should be found |
|
96 |
* @return the names of the missing CQP index files; the list is empty when all files are present |
|
97 |
*/ |
|
98 |
public ArrayList<String> isCorpusBuildValid(File dataDirectory) { |
|
99 |
if (pAttributes == null) { |
|
100 |
read(); |
|
101 |
} |
|
102 |
|
|
103 |
ArrayList<String> errors = new ArrayList<>(); |
|
104 |
|
|
105 |
// test p-attributes |
|
106 |
String[] exts = { ".corpus", ".lexicon", ".corpus.cnt", ".corpus.rdx", ".corpus.rev", ".lexicon.idx", ".lexicon.srt" }; |
|
107 |
for (String p : pAttributes) { |
|
108 |
for (String ext : exts) { |
|
109 |
File f = new File(dataDirectory, p + ext); |
|
110 |
if (!f.exists()) { |
|
111 |
// System.out.println("MISSING: " + f.exists() + " " + f.getAbsolutePath()); |
|
112 |
errors.add(f.getName()); |
|
113 |
} |
|
114 |
} |
|
115 |
} |
|
116 |
|
|
117 |
String[] sexts = { ".rng" }; |
|
118 |
String[] spexts = { ".avs", ".avx", ".rng" }; |
|
119 |
for (String s : sattrs.keySet()) { |
|
120 |
for (String ext : sexts) { |
|
121 |
File f = new File(dataDirectory, s + ext); |
|
122 |
if (!f.exists()) { |
|
123 |
// System.out.println("MISSING: " + f.exists() + " " + f.getAbsolutePath()); |
|
124 |
errors.add(f.getName()); |
|
125 |
} |
|
126 |
} |
|
127 |
|
|
128 |
for (String sp : sattrs.get(s)) { |
|
129 |
for (String ext : spexts) { |
|
130 |
File f = new File(dataDirectory, s + "_" + sp + ext); |
|
131 |
if (!f.exists()) { |
|
132 |
// System.out.println("MISSING: " + f.exists() + " " + f.getAbsolutePath()); |
|
133 |
errors.add(f.getName()); |
|
134 |
} |
|
135 |
} |
|
136 |
} |
|
137 |
} |
|
138 |
return errors; |
|
139 |
} |
|
140 |
|
|
141 |
/** |
|
142 |
* |
|
83 | 143 |
* @return the cwb-encode arguments for p attributes |
84 | 144 |
*/ |
85 | 145 |
public ArrayList<String> getPAttributes() { |
... | ... | |
111 | 171 |
} |
112 | 172 |
|
113 | 173 |
public static void main(String[] args) { |
114 |
File registry = new File(System.getProperty("user.home"), "runtime-rcpapplication.product/corpora/ANNOTATIONCONC/registry/annotationconc"); |
|
174 |
File registry = new File(System.getProperty("user.home"), "runtime-rcpapplication.product/corpora/VOEUX/registry/voeux"); |
|
175 |
File data = new File(System.getProperty("user.home"), "runtime-rcpapplication.product/corpora/VOEUX/data/VOEUX"); |
|
115 | 176 |
ReadRegistryFile reader = new ReadRegistryFile(registry); |
116 |
System.out.println("pAttributes: "+reader.getPAttributes()); |
|
117 |
System.out.println("sAttributes Map: "+reader.getSAttributesMap()); |
|
118 |
System.out.println("sAttributes: "+reader.getSAttributes()); |
|
177 |
System.out.println("pAttributes: " + reader.getPAttributes()); |
|
178 |
System.out.println("sAttributes Map: " + reader.getSAttributesMap()); |
|
179 |
System.out.println("sAttributes: " + reader.getSAttributes()); |
|
180 |
|
|
181 |
System.out.println("Validation: " + reader.isCorpusBuildValid(data)); |
|
119 | 182 |
} |
120 | 183 |
} |
tmp/org.txm.core/src/java/org/txm/objects/Project.java (revision 2996) | ||
---|---|---|
567 | 567 |
result.setUserPersistable(true); |
568 | 568 |
} |
569 | 569 |
} |
570 |
catch (Exception e) {
|
|
571 |
// TODO Auto-generated catch block
|
|
572 |
e.printStackTrace();
|
|
570 |
catch (Throwable e) {
|
|
571 |
Log.warning("Internal error: could not initialize result: " + resultNodePath + ": " + e);
|
|
572 |
Log.printStackTrace(e);
|
|
573 | 573 |
} |
574 | 574 |
} |
575 | 575 |
if (errors.size() > 0) { |
tmp/org.txm.concordance.core/src/org/txm/concordance/core/functions/Concordance.java (revision 2996) | ||
---|---|---|
207 | 207 |
public boolean loadParameters() { |
208 | 208 |
|
209 | 209 |
try { |
210 |
|
|
210 |
if (!getCorpus().hasBeenComputedOnce()) { |
|
211 |
Log.warning("Error: CQP corpus is not available: " + getCorpus()); |
|
212 |
return false; |
|
213 |
} |
|
211 | 214 |
this.setQuery(this.getStringParameterValue(ConcordancePreferences.QUERY)); |
212 | 215 |
|
213 | 216 |
this.setCQLSeparator(this.getCorpus().getCQLLimitQuery()); |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/Vocapia2Transcriber.groovy (revision 2996) | ||
---|---|---|
1 |
package org.txm.macro.transcription |
|
2 |
|
|
3 |
import javax.xml.stream.* |
|
4 |
|
|
5 |
import org.txm.importer.PersonalNamespaceContext |
|
6 |
|
|
7 |
import java.io.BufferedOutputStream |
|
8 |
import java.io.FileOutputStream |
|
9 |
import java.net.URL |
|
10 |
import java.util.regex.Pattern |
|
11 |
|
|
12 |
class Vocapia2Transcriber { |
|
13 |
|
|
14 |
File xmlfile; |
|
15 |
protected BufferedOutputStream output; |
|
16 |
protected XMLStreamWriter writer; |
|
17 |
|
|
18 |
public Vocapia2Transcriber(File xmlfile) { |
|
19 |
|
|
20 |
this.xmlfile = xmlfile; |
|
21 |
} |
|
22 |
|
|
23 |
/**
 * Converts the Vocapia XML file given to the constructor into a Transcriber (TRS) XML file.
 *
 * Element mapping: AudioDoc -> Trans, SpeakerList -> Speakers, Speaker -> Speaker,
 * SegmentList -> Episode/Section, SpeechSegment -> Turn followed by a Sync, Word -> w.
 *
 * @param outfile the TRS file to write (ISO-8859-1 encoded)
 * @param retokenizeWords if true, leading/trailing punctuation marks and elided prefixes ("l'", "qu'"...) are written as separate w elements
 * @return false if the source file does not exist or if an error occurred while converting or closing the files, true otherwise
 */
public boolean process(File outfile, boolean retokenizeWords) {

	if (!xmlfile.exists()) return false;

	// forget the streams of a previous call so that the cleanup below only touches what this call opened
	output = null
	writer = null
	def inputData = null
	XMLStreamReader parser = null
	boolean success = true

	try {
		output = new BufferedOutputStream(new FileOutputStream(outfile), 16 * 1024);
		writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "ISO-8859-1");// create a new file
		writer.setNamespaceContext(new PersonalNamespaceContext());

		inputData = xmlfile.toURI().toURL().openStream();
		parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);

		boolean flagWord = false // true while reading the content of a Word element
		def winfos = [:] // timing attributes of the current word
		boolean other = false; // true while inside a "*...*" span, which may cover several words
		String word = ""
		Pattern elision = Pattern.compile("([^']+')(.+)") // compiled once, not once per word

		try {
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {

				if (event == XMLStreamConstants.START_ELEMENT) {
					switch (parser.getLocalName()) {
						case "AudioDoc": // <AudioDoc name="xyz" path="xyz.flac"> -> <Trans scribe="see Proc elements" audio_filename="xyz.flac" version="see Proc elements" version_date="see Proc elements">
							writer.writeStartDocument("ISO-8859-1", "1.0")
							writer.writeStartElement("Trans")
							writer.writeAttribute("audio_filename", parser.getAttributeValue(null, "path"))
							break

						case "Proc": // <Proc name="scribe" version="date" editor="AAA"/>
							// continue writing the attributes of the "Trans" element
							if ("scribe" == parser.getAttributeValue(null, "name")) {
								writer.writeAttribute("scribe", parser.getAttributeValue(null, "editor"))
								writer.writeAttribute("version", parser.getAttributeValue(null, "version"))
								writer.writeAttribute("version_date", parser.getAttributeValue(null, "version"))
							}
							break

						case "SpeakerList": // <SpeakerList> -> <Speakers>
							writer.writeCharacters("\n") // after <Trans>
							writer.writeStartElement("Speakers")
							writer.writeCharacters("\n")
							break

						case "Speaker": // <Speaker ch="1" dur="531.38" gender="X" spkid="Enquêtrice" lang="fre" lconf="1.00" nw="1586" tconf="0.95"/> -> <Speaker id="spk1" name="enq4" check="no" dialect="native" accent="" scope="local"/>
							writer.writeStartElement("Speaker")
							writer.writeAttribute("id", parser.getAttributeValue(null, "spkid"))
							writer.writeAttribute("name", parser.getAttributeValue(null, "spkid"))
							writer.writeAttribute("check", "")
							writer.writeAttribute("dialect", parser.getAttributeValue(null, "lang"))
							writer.writeAttribute("accent", parser.getAttributeValue(null, "gender"))
							writer.writeAttribute("scope", "local")
							writer.writeEndElement() // closed right away, nothing to do on the end tag
							writer.writeCharacters("\n")
							break

						case "SegmentList":
							writer.writeStartElement("Episode")
							//<Section type="report" startTime="0" endTime="3617.593">
							writer.writeStartElement("Section")
							break

						case "SpeechSegment": // <SpeechSegment ch="1" sconf="1.00" stime="9.94" etime="43.81" spkid="Enquêtrice" lang="fre" lconf="1.00" trs="1">
							writer.writeStartElement("Turn")
							writer.writeAttribute("speaker", parser.getAttributeValue(null, "spkid"))
							writer.writeAttribute("startTime", parser.getAttributeValue(null, "stime"))
							writer.writeAttribute("endTime", parser.getAttributeValue(null, "etime"))
							writer.writeCharacters("\n")
							writer.writeStartElement("Sync")
							writer.writeAttribute("time", parser.getAttributeValue(null, "stime"))
							writer.writeEndElement() // Sync
							writer.writeCharacters("\n")
							break

						case "Word":
							flagWord = true
							word = ""

							// store the word timing in case the word must be split: end = stime + dur
							def endValue = String.format(Locale.US, "%.2f", (Double.parseDouble(parser.getAttributeValue(null, "stime")) + Double.parseDouble(parser.getAttributeValue(null, "dur"))))
							winfos = ["time":parser.getAttributeValue(null, "stime"), "start": parser.getAttributeValue(null, "stime"), "end":endValue]
							break
					}
				} else if (event == XMLStreamConstants.END_ELEMENT) {
					switch (parser.getLocalName()) {
						case "AudioDoc":
							writer.writeEndElement() // Trans
							break

						case "SpeakerList":
							writer.writeEndElement() // Speakers
							writer.writeCharacters("\n")
							break

						case "SegmentList":
							writer.writeEndElement() // Section
							writer.writeEndElement() // Episode
							writer.writeCharacters("\n")
							break

						case "SpeechSegment":
							writer.writeEndElement() // Turn
							writer.writeCharacters("\n")
							break

						case "Word":
							flagWord = false
							word = word.trim()
							if (word.startsWith("*")) { // opening mark of an "other" span
								other = true
								word = word.substring(1)
							}

							// set now: the word carrying the closing "*" still belongs to the span
							String otherAttributeValue = Boolean.toString(other)

							if (other && word.endsWith("*")) { // closing mark of the span
								word = word.substring(0, word.length()-1)
								other = false
							}

							// tokens to write before the word: leading punctuation marks then elided prefixes
							def before = []
							if (retokenizeWords) {
								while (word.length() > 0 && word.matches("\\p{Punct}.+")) {
									before << word.substring(0, 1)
									word = word.substring(1, word.length())
								}

								def m = elision.matcher(word)
								while (word.length() > 0 && m.matches()) {
									before << m.group(1)
									word = m.group(2)
									m = elision.matcher(word)
								}
							}

							// tokens to write after the word: trailing punctuation marks.
							// They are found from right to left, so each one is inserted at the
							// head of the list to keep the order of the text ("?!" stays "?" then "!")
							def after = []
							if (retokenizeWords) {
								while (word.length() > 0 && word.matches(".+\\p{Punct}")) {
									after.add(0, word.substring(word.length()-1, word.length()))
									word = word.substring(0, word.length()-1)
								}
							}

							for (def punct : before) { // zero-length tokens located at the start of the word
								writeWordElement(winfos["time"], winfos["start"], winfos["start"], null, punct)
							}

							writeWordElement(winfos["time"], winfos["start"], winfos["end"], otherAttributeValue, word)

							for (def punct : after) { // zero-length tokens located at the end of the word
								writeWordElement(winfos["time"], winfos["end"], winfos["end"], null, punct)
							}
							break
					}
				} else if (event == XMLStreamConstants.CHARACTERS) {
					if (flagWord) {
						word += parser.getText()
					}
				}
			}
		} catch (Exception e) {
			println "Error while processing XML File "+xmlfile+": "
			e.printStackTrace();
			println "At: "+parser.getLocation();
			println "See: "+outfile.getAbsolutePath()
			success = false // the result file is incomplete: let the caller know
		}
	} finally {
		// The StAX writer and reader do not close their underlying streams, so all four are closed here.
		// The writer goes first since closing it pushes the pending XML into 'output'.
		def closeActions = [{ writer?.close() }, { output?.close() }, { parser?.close() }, { inputData?.close() }]
		for (def closeAction : closeActions) {
			try {
				closeAction()
			} catch (Exception e) {
				println "Error while closing the streams used to convert "+xmlfile+": "+e
				success = false
			}
		}
	}

	return success;
}

/**
 * Writes one w element followed by a line break.
 *
 * @param time value of the "time" attribute
 * @param start value of the "start" attribute
 * @param end value of the "end" attribute
 * @param otherValue value of the "other" attribute, or null to not write the attribute
 * @param text the content of the element
 */
private void writeWordElement(String time, String start, String end, String otherValue, String text) {
	writer.writeStartElement("w")
	writer.writeAttribute("time", time)
	writer.writeAttribute("start", start)
	writer.writeAttribute("end", end)
	if (otherValue != null) {
		writer.writeAttribute("other", otherValue)
	}
	writer.writeCharacters(text)
	writer.writeEndElement() // w
	writer.writeCharacters("\n")
}
|
244 |
|
|
245 |
/**
 * Manual test entry point: converts a sample Vocapia file and prints the result of process().
 * The input and output paths are hard-coded developer paths.
 */
public static void main(String[] args) {
	File infile = new File("/home/mdecorde/xml/vocapia","test.xml")
	File outfile = new File("/home/mdecorde/xml/vocapia","test.trs")
	def processor = new Vocapia2Transcriber(infile)
	// process() has no single-argument form: the retokenization flag must be given
	println processor.process(outfile, true)
}
|
251 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/Vocapia2TranscriberMacro.groovy (revision 2996) | ||
---|---|---|
1 |
package org.txm.macro.transcription |
|
2 |
|
|
3 |
import java.time.LocalTime |
|
4 |
import java.time.format.DateTimeFormatter |
|
5 |
import org.txm.utils.* |
|
6 |
import org.txm.utils.logger.* |
|
7 |
|
|
8 |
@Field @Option(name="vocapiaFile", usage="A single vocapia XML file", widget="FileOpen", required=false, def="") |
|
9 |
File vocapiaFile; |
|
10 |
|
|
11 |
@Field @Option(name="vocapiaDirectory", usage="A Vocapia XML files directory to process", widget="Folder", required=false, def="") |
|
12 |
File vocapiaDirectory; |
|
13 |
|
|
14 |
@Field @Option(name="resultDirectory", usage="The result directory", widget="Folder", required=false, def="") |
|
15 |
File resultDirectory; |
|
16 |
|
|
17 |
@Field @Option(name="retokenize_words", usage="retokenize words prefixed or postfixed with puunctuations", widget="Boolean", required=true, def="true") |
|
18 |
Boolean retokenize_words; |
|
19 |
|
|
20 |
if (!ParametersDialog.open(this)) return; |
|
21 |
|
|
22 |
resultDirectory.mkdirs(); |
|
23 |
|
|
24 |
def xmlFiles = [] |
|
25 |
if (vocapiaDirectory != null && vocapiaDirectory.exists()) { |
|
26 |
|
|
27 |
println "Processing directory: $vocapiaDirectory" |
|
28 |
for (File file : vocapiaDirectory.listFiles()) { |
|
29 |
if (file.getName().toLowerCase().endsWith(".xml")) { |
|
30 |
xmlFiles << file |
|
31 |
} |
|
32 |
} |
|
33 |
} else if (vocapiaFile != null && vocapiaFile.exists()) { |
|
34 |
println "Processing file: $vocapiaFile" |
|
35 |
xmlFiles << vocapiaFile |
|
36 |
} |
|
37 |
|
|
38 |
if (xmlFiles.size() == 0) { |
|
39 |
println "No XML file found for parameters vocapiaFile=$vocapiaFile and vocapiaDirectory=$vocapiaDirectory" |
|
40 |
return false |
|
41 |
} |
|
42 |
|
|
43 |
ConsoleProgressBar cpb = new ConsoleProgressBar(xmlFiles.size()) |
|
44 |
for (File xmlFile : xmlFiles) { |
|
45 |
cpb.tick() |
|
46 |
Vocapia2Transcriber v2t = new Vocapia2Transcriber(xmlFile) |
|
47 |
String name = FileUtils.stripExtension(xmlFile) |
|
48 |
File outFile = new File(resultDirectory, name+".trs") |
|
49 |
|
|
50 |
if (!v2t.process(outFile, retokenize_words)) { |
|
51 |
println "WARNING: ERROR WHILE PROCESSING: "+xmlFile |
|
52 |
return false |
|
53 |
} |
|
54 |
} |
|
55 |
cpb.done() |
|
56 |
|
|
57 |
println "Done: "+xmlFiles.size()+" files processed. Result files in $resultDirectory" |
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/prototypes/stats/InvertCAXAxisMacro.groovy (revision 2996) | ||
---|---|---|
1 |
// @author Sebastien Jacquot |
|
2 |
// STANDARD DECLARATIONS |
|
3 |
package org.txm.macro |
|
4 |
|
|
5 |
|
|
6 |
import org.kohsuke.args4j.* |
|
7 |
import groovy.transform.Field |
|
8 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
9 |
import org.txm.searchengine.cqp.clientExceptions.* |
|
10 |
import org.txm.searchengine.cqp.corpus.* |
|
11 |
import org.txm.searchengine.cqp.corpus.query.* |
|
12 |
import org.apache.commons.lang.time.StopWatch |
|
13 |
import java.util.Arrays |
|
14 |
import org.jfree.chart.renderer.xy.* |
|
15 |
import org.jfree.chart.renderer.* |
|
16 |
import org.jfree.chart.plot.* |
|
17 |
import org.jfree.data.xy.* |
|
18 |
import org.jfree.chart.axis.* |
|
19 |
import java.awt.*; |
|
20 |
import java.awt.geom.*; |
|
21 |
import org.jfree.chart.labels.* |
|
22 |
|
|
23 |
import org.txm.ca.core.chartsengine.jfreechart.themes.highcharts.renderers.* |
|
24 |
import org.txm.ca.rcp.editors.* |
|
25 |
import org.txm.libs.office.ReadODS |
|
26 |
import org.txm.ca.core.chartsengine.jfreechart.datasets.* |
|
27 |
import org.jfree.chart.renderer.AbstractRenderer |
|
28 |
|
|
29 |
println "editor: "+editor |
|
30 |
|
|
31 |
if (!(editor instanceof CAEditor)) { |
|
32 |
println "editor is not a CA editor: $editor, Run the macro with F12 when the editor is selected :-)" |
|
33 |
return |
|
34 |
} |
|
35 |
|
|
36 |
ica = editor.getCA(); |
|
37 |
chart = ica.getChart(); |
|
38 |
plot = chart.getXYPlot(); |
|
39 |
dataset = plot.getDataset(); |
|
40 |
|
|
41 |
// overrides some dataset methods to return inverted X coordinates for columns and rows |
|
42 |
plot.setDataset(new CAXYDataset(ica) { |
|
43 |
|
|
44 |
public Number getX(int series, int item) { |
|
45 |
if(item == -1) { |
|
46 |
System.out.println("CAXYDataset.getX()"); |
|
47 |
} |
|
48 |
// Rows |
|
49 |
if(series == 0) { |
|
50 |
return -this.rowCoordinates[item][this.axis1]; |
|
51 |
} |
|
52 |
// Cols |
|
53 |
else { |
|
54 |
return -this.columnCoordinates[item][this.axis1]; |
|
55 |
} |
|
56 |
} |
|
57 |
|
|
58 |
|
|
59 |
|
|
60 |
/** |
|
61 |
* Gets the minimum value in the specified series according to the specified axis. |
|
62 |
* @param series |
|
63 |
* @param axis |
|
64 |
* @return |
|
65 |
*/ |
|
66 |
public double getMinValue(int series, int axis) { |
|
67 |
double minValue = 0; |
|
68 |
double tmpMinValue; |
|
69 |
double[][] coordinates = this.rowCoordinates; |
|
70 |
if(series != 0) { |
|
71 |
coordinates = this.columnCoordinates; |
|
72 |
} |
|
73 |
|
|
74 |
for(int i = 0; i < coordinates.length; i++) { |
|
75 |
tmpMinValue = coordinates[i][axis]; |
|
76 |
|
|
77 |
// invert X coordinate |
|
78 |
if(axis == 0) { |
|
79 |
tmpMinValue = -tmpMinValue; |
|
80 |
} |
|
81 |
|
|
82 |
if(tmpMinValue < minValue) { |
|
83 |
minValue = tmpMinValue; |
|
84 |
} |
|
85 |
} |
|
86 |
|
|
87 |
return minValue; |
|
88 |
} |
|
89 |
|
|
90 |
/** |
|
91 |
* Gets the maximum value in the specified series according to the specified axis. |
|
92 |
* @param series |
|
93 |
* @param axis |
|
94 |
* @return |
|
95 |
*/ |
|
96 |
public double getMaxValue(int series, int axis) { |
|
97 |
double maxValue = 0; |
|
98 |
double tmpMaxValue; |
|
99 |
double[][] coordinates = this.rowCoordinates; |
|
100 |
if(series != 0) { |
|
101 |
coordinates = this.columnCoordinates; |
|
102 |
} |
|
103 |
|
|
104 |
for(int i = 0; i < coordinates.length; i++) { |
|
105 |
tmpMaxValue = coordinates[i][axis]; |
|
106 |
|
|
107 |
// invert X coordinate |
|
108 |
if(axis == 0) { |
|
109 |
tmpMaxValue = -tmpMaxValue; |
|
110 |
} |
|
111 |
|
|
112 |
if(tmpMaxValue > maxValue) { |
|
113 |
maxValue = tmpMaxValue; |
|
114 |
} |
|
115 |
} |
|
116 |
|
|
117 |
return maxValue; |
|
118 |
} |
|
119 |
} |
|
120 |
); |
|
121 |
|
|
122 |
// update the limits dotted borders |
|
123 |
ica.getChartCreator().createCAFactorialMapChartLimitsBorder(chart); |
|
124 |
|
|
125 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/TRSToTEI.groovy (revision 2996) | ||
---|---|---|
163 | 163 |
} |
164 | 164 |
|
165 | 165 |
|
166 |
boolean ignoreFirstSync = false // need to skip fist Sync when multiple locutors in Turn
|
|
166 |
boolean overlapingTurn = false // need to skip fist Sync when multiple locutors in Turn
|
|
167 | 167 |
/** |
168 | 168 |
* Process. |
169 | 169 |
* |
... | ... | |
182 | 182 |
case "Turn": // >> sp |
183 | 183 |
testCloseU(); |
184 | 184 |
vSpeaker = parser.getAttributeValue(null, "speaker"); |
185 |
overlapingTurn = false |
|
185 | 186 |
if (vSpeaker == null) { vSpeaker="N/A" // no spk |
186 | 187 |
} else { |
187 |
localspeakers = vSpeaker.split(" ") |
|
188 |
if (localspeakers.size() == 0) { // only one speaker |
|
189 |
//println "FOUND ONE SPEAKER" |
|
190 |
if (speakersname.containsKey(vSpeaker)) { |
|
191 |
vSpeaker = speakersname.get(vSpeaker); |
|
188 |
|
|
189 |
if (speakersname.containsKey(vSpeaker)) { |
|
190 |
//vSpeaker = speakersname.get(vSpeaker); |
|
191 |
} else { |
|
192 |
localspeakers = vSpeaker.split(" ") |
|
193 |
if (localspeakers.size() > 1) { // only one speaker |
|
194 |
overlapingTurn = true |
|
192 | 195 |
} |
193 | 196 |
} |
194 | 197 |
} |
195 | 198 |
|
196 | 199 |
writer.writeStartElement("sp") |
197 | 200 |
writer.writeAttribute("n", Integer.toString(idturn++)) |
198 |
ignoreFirstSync = vSpeaker.contains(" ") // need to skip fist Sync when multiple locutors in Turn
|
|
199 |
writer.writeAttribute("overlap", ""+ignoreFirstSync)
|
|
201 |
overlapingTurn = vSpeaker.contains(" ") // need to skip fist Sync when multiple locutors in Turn
|
|
202 |
writer.writeAttribute("overlap", ""+overlapingTurn)
|
|
200 | 203 |
|
201 | 204 |
String time = parser.getAttributeValue(null, "startTime"); |
202 | 205 |
formatedTime = formatTime(time) |
... | ... | |
217 | 220 |
case "Sync": // >> u |
218 | 221 |
lastTime = parser.getAttributeValue(null, "time") |
219 | 222 |
testCloseU(); |
220 |
if (ignoreFirstSync) { // need to skip fist Sync when multiple locutors in Turn |
|
221 |
ignoreFirstSync = false; |
|
222 |
} else { |
|
223 |
// if (overlapingTurn) { // need to skip fist Sync when multiple locutors in Turn |
|
224 |
// overlapingTurn = false; |
|
225 |
// } else { |
|
226 |
// |
|
227 |
// } |
|
228 |
if (!overlapingTurn) { |
|
223 | 229 |
writeU() |
224 | 230 |
} |
225 | 231 |
break; |
... | ... | |
279 | 285 |
private testCloseU() { |
280 | 286 |
if (uOpened) { |
281 | 287 |
super.processEndElement(); // u |
288 |
writer.writeCharacters("\n") |
|
282 | 289 |
uOpened = false; |
283 | 290 |
} |
284 | 291 |
} |
... | ... | |
295 | 302 |
// println "getting spk name? ="+speakers.get(vSpeaker) |
296 | 303 |
// println "speakers: $speakers" |
297 | 304 |
def attributes = speakers.get(vSpeaker) |
305 |
//println "ATTRIBUTES="+attributes+" vSpeaker='$vSpeaker'" |
|
298 | 306 |
if (attributes == null) { // in case of Who@n wrong number |
299 |
if (vSpeaker.startsWith("#") && vSpeaker.endsWith("?")) { // don't show "N/A" vSpeaker |
|
307 |
// if (vSpeaker.startsWith("#") && vSpeaker.endsWith("?")) { // don't show "N/A" vSpeaker
|
|
300 | 308 |
writer.writeAttribute("who", vSpeaker) |
301 | 309 |
writer.writeAttribute("spkid", vSpeaker) |
302 |
writeAttributes(); |
|
303 |
} |
|
310 |
//writeAttributes(); |
|
311 |
// } else { |
|
312 |
// |
|
313 |
// } |
|
304 | 314 |
} else { |
305 | 315 |
for (Pair p : attributes) { |
306 | 316 |
// println " write attribute "+p.getFirst()+" "+p.getSecond() |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/pager.groovy (revision 2996) | ||
---|---|---|
424 | 424 |
def g = l2[i] |
425 | 425 |
metadata[m] = "" // forcing order of metadata by pre-declaring |
426 | 426 |
|
427 |
if (!metadataGroups.containsKey(g)) metadataGroups[g] = [] |
|
428 |
|
|
427 |
if (!metadataGroups.containsKey(g)) { |
|
428 |
metadataGroups[g] = [] |
|
429 |
} |
|
429 | 430 |
metadataGroups[g] << m // declaring a metadata type |
430 | 431 |
} |
431 | 432 |
metadataDeclared = true |
432 | 433 |
} |
433 | 434 |
|
434 |
//store attributes values in HashMap
|
|
435 |
//store attributes values in HashMap |
|
435 | 436 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) { |
436 | 437 |
String name = parser.getAttributeLocalName(i) |
437 | 438 |
if (!"type".equals(name) |
... | ... | |
441 | 442 |
&& !"startTime".equals(name) |
442 | 443 |
&& !"endTime".equals(name)) { |
443 | 444 |
metadata[name] = parser.getAttributeValue(i) |
444 |
|
|
445 |
|
|
445 | 446 |
if (!metadataDeclared && !metadataGroups["metadata"].contains(name)) { |
446 | 447 |
metadataGroups["metadata"] << name |
447 | 448 |
} |
448 | 449 |
} |
449 | 450 |
} |
450 | 451 |
|
451 |
// write metadata HTML
|
|
452 |
// write metadata HTML |
|
452 | 453 |
if (metadataGroups.keySet().size() > 0) { |
453 | 454 |
writer.writeStartElement("p") |
454 | 455 |
writer.writeAttribute("class", "section-all-metadata"); |
... | ... | |
497 | 498 |
writer.writeAttribute("class", "turn"); |
498 | 499 |
|
499 | 500 |
overlapping = ("true" == parser.getAttributeValue(null,"overlap")) |
500 |
String spid = parser.getAttributeValue(null,"speaker");
|
|
501 |
String spid = parser.getAttributeValue(null, "who");
|
|
501 | 502 |
|
502 | 503 |
whos = [] |
503 | 504 |
if (overlapping) { |
504 |
writer.writeEmptyElement("br");
|
|
505 |
writeSpeaker(parser.getAttributeValue(null,"speaker"), false)
|
|
505 |
//writer.writeEmptyElement("br"); // write all overlaping speakers
|
|
506 |
//writeSpeaker(""+parser.getAttributeValue(null, "who"), false)
|
|
506 | 507 |
|
507 | 508 |
writer.writeEmptyElement("br"); |
508 | 509 |
whos = spid.split(" ") |
... | ... | |
521 | 522 |
writer.writeEmptyElement("br"); |
522 | 523 |
} |
523 | 524 |
|
524 |
String spk = parser.getAttributeValue(null, "spk")
|
|
525 |
String spk = parser.getAttributeValue(null, "who")
|
|
525 | 526 |
if (spk != null && spk != previousSPK) { |
526 | 527 |
endBoldIfNeeded() |
527 | 528 |
writer.writeEmptyElement("br"); |
528 |
writeSpeaker(parser.getAttributeValue(null, "spk"), overlapping)
|
|
529 |
writeSpeaker(parser.getAttributeValue(null, "who"), overlapping)
|
|
529 | 530 |
startBoldIfNeeded() |
530 | 531 |
} |
531 | 532 |
|
532 | 533 |
writeCurrentTime() |
533 | 534 |
previousSPK = spk |
535 |
if (overlapping) previousSPK = null |
|
534 | 536 |
|
535 | 537 |
// writenLength = 0; |
536 | 538 |
/*writer.writeStartElement("span"); |
... | ... | |
546 | 548 |
desc = translateEvent(desc); |
547 | 549 |
String type = parser.getAttributeValue(null,"type"); |
548 | 550 |
if (desc.equals("paroles rapportées")) { |
549 |
if (parser.getAttributeValue(null, "extent") == "end") |
|
551 |
if (parser.getAttributeValue(null, "extent") == "end") {
|
|
550 | 552 |
writer.writeCharacters("» "); |
551 |
else if (parser.getAttributeValue(null, "extent") == "begin") |
|
553 |
} |
|
554 |
else if (parser.getAttributeValue(null, "extent") == "begin") { |
|
552 | 555 |
writer.writeCharacters(" «"); |
556 |
} |
|
553 | 557 |
} else { |
554 | 558 |
writer.writeStartElement("span"); |
555 | 559 |
writer.writeAttribute("class", "event"); |
... | ... | |
564 | 568 |
events.add(desc) |
565 | 569 |
} |
566 | 570 |
else if (parser.getAttributeValue(null, "extent") == "previous") { |
567 |
if(parser.getAttributeValue(null, "type") == "pronounce") |
|
571 |
if (parser.getAttributeValue(null, "type") == "pronounce")
|
|
568 | 572 |
writer.writeCharacters("_["+desc+"] "); |
569 | 573 |
else |
570 | 574 |
writer.writeCharacters("_["+desc+"] "); |
... | ... | |
574 | 578 |
writer.writeCharacters(" ["+desc+"]_"); |
575 | 579 |
nextEvent = desc |
576 | 580 |
} |
577 |
else |
|
581 |
else {
|
|
578 | 582 |
writer.writeCharacters(" ["+desc+"] "); |
583 |
} |
|
579 | 584 |
writer.writeEndElement(); // span@class=event |
580 | 585 |
} |
581 | 586 |
break; |
... | ... | |
692 | 697 |
if(l > 0) |
693 | 698 |
endOfLastWord = lastword.subSequence(l-1, l); |
694 | 699 |
|
695 |
if(interpvalue != null)
|
|
700 |
if (interpvalue != null) {
|
|
696 | 701 |
interpvalue = interpvalue.replace("\"","""); |
697 |
if(events.size() > 0) |
|
702 |
} |
|
703 |
if (events.size() > 0) { |
|
698 | 704 |
interpvalue = interpvalue.replace("event=", "event="+events.toString().replace("\"",""")); // remove ", " |
699 |
|
|
700 |
if(nextEvent.length() > 0) |
|
701 |
{ |
|
705 |
} |
|
706 |
if (nextEvent.length() > 0) { |
|
702 | 707 |
interpvalue = interpvalue.replace("event=", "event="+nextEvent+", ") |
703 | 708 |
nextEvent = "" |
704 | 709 |
} |
... | ... | |
709 | 714 |
// println "NoSpaceAfter: "+NoSpaceAfter+" contains ? "+lastword |
710 | 715 |
// println "wordvalue starts with '-' ? "+wordvalue |
711 | 716 |
// println "NoSpaceAfter: "+NoSpaceAfter+" contains endOfLastWord ? "+endOfLastWord |
712 |
if(NoSpaceBefore.contains(wordvalue) || |
|
717 |
if (NoSpaceBefore.contains(wordvalue) ||
|
|
713 | 718 |
NoSpaceAfter.contains(lastword) || |
714 | 719 |
wordvalue.startsWith("-") || |
715 | 720 |
NoSpaceAfter.contains(endOfLastWord)) { |
... | ... | |
755 | 760 |
break; |
756 | 761 |
|
757 | 762 |
case XMLStreamConstants.CHARACTERS: |
758 |
if(flagform)
|
|
759 |
if(parser.getText().length() > 0)
|
|
763 |
if (flagform) {
|
|
764 |
if (parser.getText().length() > 0) {
|
|
760 | 765 |
wordvalue+=(parser.getText().trim()); |
761 |
if(flaginterp) |
|
762 |
if(parser.getText().length() > 0) |
|
766 |
} |
|
767 |
} |
|
768 |
if (flaginterp) { |
|
769 |
if (parser.getText().length() > 0) { |
|
763 | 770 |
interpvalue+=(parser.getText().trim()); |
771 |
} |
|
772 |
} |
|
764 | 773 |
break; |
765 | 774 |
} |
766 | 775 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZDefaultPagerStep.groovy (revision 2996) | ||
---|---|---|
290 | 290 |
public boolean process() { |
291 | 291 |
|
292 | 292 |
try { |
293 |
def anaValues = [:] |
|
294 |
def anaType = "" |
|
295 |
def anaResp = "" |
|
296 |
def anaValue = new StringBuilder() |
|
297 |
|
|
293 | 298 |
boolean flagNote = false; |
294 | 299 |
String noteContent = ""; |
295 | 300 |
String rend = "" |
... | ... | |
446 | 451 |
break; |
447 | 452 |
case wordTag: |
448 | 453 |
wordid = getAttributeValue(parser, null,"id"); |
449 |
|
|
454 |
anaValues.clear() |
|
450 | 455 |
wordcount++; |
451 | 456 |
if (wordcount >= wordmax) { |
452 | 457 |
createNextOutput(); |
... | ... | |
460 | 465 |
break; |
461 | 466 |
case "ana": |
462 | 467 |
flaginterp=true; |
463 |
interpvalue+=" "+getAttributeValue(parser, null, "type").substring(1)+":" |
|
468 |
anaType = getAttributeValue(parser, null, "type").substring(1) |
|
469 |
anaResp = getAttributeValue(parser, null, "resp").substring(1) |
|
470 |
anaValue.setLength(0) |
|
464 | 471 |
break; |
465 | 472 |
case "form": |
466 | 473 |
wordvalue="" |
467 |
interpvalue ="" |
|
468 | 474 |
flagform=true |
469 | 475 |
break; |
470 | 476 |
default: |
... | ... | |
540 | 546 |
break; |
541 | 547 |
case "ana": |
542 | 548 |
flaginterp = false |
549 |
if (anaValues[anaType] == null || "src".equals(anaResp)) { |
|
550 |
anaValues[anaType] = anaValue.toString().trim() |
|
551 |
} |
|
543 | 552 |
break; |
544 | 553 |
case wordTag: |
545 | 554 |
int l = lastword.length(); |
... | ... | |
548 | 557 |
endOfLastWord = lastword.subSequence(l-1, l) |
549 | 558 |
} |
550 | 559 |
|
551 |
if (interpvalue != null) { |
|
552 |
interpvalue = interpvalue |
|
553 |
} |
|
560 |
String interpvalue = anaValues.entrySet().join(", ") |
|
561 |
|
|
554 | 562 |
if (NoSpaceBefore.contains(wordvalue) || |
555 | 563 |
NoSpaceAfter.contains(lastword) || |
556 | 564 |
wordvalue.startsWith("-") || |
... | ... | |
591 | 599 |
noteContent += parser.getText().replace("\n", " ") |
592 | 600 |
} |
593 | 601 |
} else if (flaginterp && parser.getText().length() > 0) { |
594 |
interpvalue+=(parser.getText())
|
|
602 |
anaValue.append(parser.getText())
|
|
595 | 603 |
} else if (flagNote == parser.getText().length() > 0) { |
596 | 604 |
noteContent += parser.getText().replace("\n", " ") |
597 | 605 |
} else if (writeOutOfTextToEditText) { |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xml/pager.groovy (revision 2996) | ||
---|---|---|
72 | 72 |
/** The wordvalue. */ |
73 | 73 |
String wordvalue = ""; |
74 | 74 |
|
75 |
/** The interpvalue. */ |
|
76 |
String interpvalue = ""; |
|
77 |
|
|
78 |
/** The lastword. */ |
|
75 |
/** The lastword. */ |
|
79 | 76 |
String lastword = " "; |
80 | 77 |
|
81 | 78 |
/** The wordtype. */ |
... | ... | |
282 | 279 |
|
283 | 280 |
String localname = ""; |
284 | 281 |
createNextOutput(); |
282 |
def anaValues = [:] |
|
283 |
def anaType = "" |
|
284 |
def anaResp = "" |
|
285 |
def anaValue = new StringBuilder() |
|
285 | 286 |
for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) { |
286 | 287 |
switch (event) { |
287 | 288 |
case XMLStreamConstants.START_ELEMENT: |
... | ... | |
352 | 353 |
case "w": |
353 | 354 |
|
354 | 355 |
wordid = parser.getAttributeValue(null,"id"); |
355 |
|
|
356 |
anaValues.clear() |
|
356 | 357 |
wordcount++; |
357 | 358 |
if (wordcount >= wordmax) { |
358 | 359 |
createNextOutput(); |
... | ... | |
366 | 367 |
break; |
367 | 368 |
case "ana": |
368 | 369 |
flaginterp=true; |
369 |
interpvalue+=" "+parser.getAttributeValue(null,"type").substring(1)+":" |
|
370 |
anaType = parser.getAttributeValue(null, "type").substring(1) |
|
371 |
anaResp = parser.getAttributeValue(null, "resp").substring(1) |
|
372 |
anaValue.setLength(0) |
|
370 | 373 |
break; |
371 | 374 |
case "form": |
372 | 375 |
wordvalue=""; |
373 |
interpvalue =""; |
|
374 | 376 |
flagform=true; |
375 | 377 |
break; |
376 | 378 |
// default: |
... | ... | |
406 | 408 |
break; |
407 | 409 |
case "ana": |
408 | 410 |
flaginterp = false |
411 |
if (anaValues[anaType] == null || "src".equals(anaResp)) { |
|
412 |
anaValues[anaType] = anaValue.toString().trim() |
|
413 |
} |
|
409 | 414 |
break; |
410 | 415 |
case "w": |
411 | 416 |
int l = lastword.length(); |
... | ... | |
413 | 418 |
if (l > 0) |
414 | 419 |
endOfLastWord = lastword.subSequence(l-1, l); |
415 | 420 |
|
416 |
if (interpvalue != null) |
|
417 |
interpvalue = interpvalue; |
|
418 |
|
|
421 |
String interpvalue = anaValues.entrySet().join(", ") |
|
422 |
|
|
419 | 423 |
if (NoSpaceBefore.contains(wordvalue) || |
420 | 424 |
NoSpaceAfter.contains(lastword) || |
421 | 425 |
wordvalue.startsWith("-") || |
... | ... | |
441 | 445 |
if (flagNote == parser.getText().length() > 0) |
442 | 446 |
noteContent += parser.getText().replace("\n", " "); |
443 | 447 |
} else if (flaginterp && parser.getText().length() > 0) { |
444 |
interpvalue+=(parser.getText());
|
|
448 |
anaValue.append(parser.getText());
|
|
445 | 449 |
} else if (flagNote == parser.getText().length() > 0) { |
446 | 450 |
noteContent += parser.getText().replace("\n", " "); |
447 | 451 |
} |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xml/importer.groovy (revision 2996) | ||
---|---|---|
34 | 34 |
|
35 | 35 |
import org.txm.* |
36 | 36 |
import org.txm.scripts.importer.* |
37 |
import org.txm.utils.io.FileCopy |
|
37 | 38 |
import org.txm.importer.scripts.filters.* |
38 | 39 |
import org.txm.objects.* |
39 | 40 |
import org.txm.scripts.* |
... | ... | |
144 | 145 |
srcDirectory.mkdir() |
145 | 146 |
for (File f : okfiles) { |
146 | 147 |
File outputFile = new File (srcDirectory, f.getName()) |
148 |
// println "TEMP REMOVED SURROGATE FIX" |
|
149 |
// FileCopy.copy(f, outputFile) |
|
147 | 150 |
CleanFile.removeSurrogateFromXmlFile(f, outputFile) |
148 | 151 |
} |
149 | 152 |
okfiles = srcDirectory.listFiles() |
Formats disponibles : Unified diff