Révision 3466
TXM/trunk/org.txm.searchengine.cqp.core/src/org/txm/importer/cwb/CompressCQPIndexes.java (revision 3466) | ||
---|---|---|
87 | 87 |
|
88 | 88 |
ArrayList<String> args = new ArrayList<>(Arrays.asList(huff.getAbsolutePath(), "-T", "-r", registryfile.getParent())); |
89 | 89 |
|
90 |
// ReadRegistryFile rrf = new ReadRegistryFile(registryfile); |
|
91 |
// rrf.read(); |
|
92 |
// for (String p : rrf.pAttributes) { |
|
93 |
// args.add("-P"); |
|
94 |
// args.add(p); |
|
95 |
// } |
|
90 |
// ReadRegistryFile rrf = new ReadRegistryFile(registryfile);
|
|
91 |
// rrf.read();
|
|
92 |
// for (String p : rrf.pAttributes) {
|
|
93 |
// args.add("-P");
|
|
94 |
// args.add(p);
|
|
95 |
// }
|
|
96 | 96 |
args.add("-A"); |
97 | 97 |
args.add(corpusid); |
98 | 98 |
|
... | ... | |
108 | 108 |
|
109 | 109 |
ArrayList<String> args2 = new ArrayList<>(Arrays.asList(rdxcompressor.getAbsolutePath(), "-T", "-r", registryfile.getParent())); |
110 | 110 |
|
111 |
// for (String p : rrf.pAttributes) { |
|
112 |
// File f = new File(dataDirectory, p+".corpus"); |
|
113 |
// if (f.length() > 0) { |
|
114 |
// args2.add("-P"); |
|
115 |
// args2.add(p); |
|
116 |
// } |
|
117 |
// } |
|
111 |
// for (String p : rrf.pAttributes) {
|
|
112 |
// File f = new File(dataDirectory, p+".corpus");
|
|
113 |
// if (f.length() > 0) {
|
|
114 |
// args2.add("-P");
|
|
115 |
// args2.add(p);
|
|
116 |
// }
|
|
117 |
// }
|
|
118 | 118 |
args2.add("-A"); |
119 | 119 |
args2.add(corpusid); |
120 | 120 |
|
... | ... | |
128 | 128 |
return false; |
129 | 129 |
} |
130 | 130 |
|
131 |
// remove .corpus .corpus.rdx and corpus.rev files
|
|
131 |
// remove .corpus files if the compression was successful
|
|
132 | 132 |
int s = 0; |
133 | 133 |
int a = 0; |
134 | 134 |
for (File f : dataDirectory.listFiles()) { |
135 |
if (f.getName().endsWith(".corpus") || f.getName().endsWith(".corpus.rdx") ||f.getName().endsWith(".corpus.rev")) { |
|
136 |
s += f.length(); |
|
137 |
f.delete(); |
|
138 |
if (txm081fix) f.createNewFile(); |
|
135 |
if (f.getName().endsWith(".corpus")) { |
|
136 |
|
|
137 |
|
|
138 |
String path = f.getAbsolutePath(); |
|
139 |
path = path.substring(0, path.length() - 7)+".huf"; |
|
140 |
File cfile = new File(path); |
|
141 |
if (cfile.exists()) { |
|
142 |
s += f.length(); |
|
143 |
f.delete(); |
|
144 |
if (txm081fix) f.createNewFile(); |
|
145 |
} else { |
|
146 |
continue; |
|
147 |
} |
|
139 | 148 |
} |
140 |
if (f.getName().matches(".+(\\.hcd|\\.huf|\\.huf\\.syn|\\.crc|\\.crx)")) {
|
|
149 |
if (f.getName().matches(".+(\\.hcd|\\.huf|\\.huf\\.syn)")) { |
|
141 | 150 |
a += f.length(); |
142 | 151 |
} |
143 | 152 |
} |
144 | 153 |
|
154 |
// remove .corpus.rdx and .corpus.rev files if the compression was successful |
|
155 |
for (File f : dataDirectory.listFiles()) { |
|
156 |
if (f.getName().endsWith(".corpus.rdx") ||f.getName().endsWith(".corpus.rev")) { |
|
157 |
|
|
158 |
String path = f.getAbsolutePath(); |
|
159 |
path = path.substring(0, path.length() - 7)+".crc"; |
|
160 |
File cfile = new File(path); |
|
161 |
if (cfile.exists()) { |
|
162 |
s += f.length(); |
|
163 |
f.delete(); |
|
164 |
if (txm081fix) f.createNewFile(); |
|
165 |
} else { |
|
166 |
continue; |
|
167 |
} |
|
168 |
} |
|
169 |
if (f.getName().matches(".+(\\.crc|\\.crx)")) { |
|
170 |
a += f.length(); |
|
171 |
} |
|
172 |
} |
|
173 |
|
|
145 | 174 |
System.out.println("cleared: "+s); |
146 | 175 |
System.out.println("created: "+a); |
147 | 176 |
System.out.println("diff="+(s-a)); |
148 | 177 |
return true; |
149 | 178 |
} |
150 |
|
|
179 |
|
|
151 | 180 |
/** |
152 | 181 |
* The main method. |
153 | 182 |
* |
... | ... | |
156 | 185 |
public static void main(String[] args) { |
157 | 186 |
try { |
158 | 187 |
String userdir = System.getProperty("user.home"); |
159 |
File tools = new File(userdir, "workspace-cpp/CWB-lib/src/builds/linux-64"); //$NON-NLS-1$
|
|
188 |
File tools = new File(userdir, "SVN/txm-sf/CWB/cwb-lib/src/builds/linux-64"); //$NON-NLS-1$
|
|
160 | 189 |
File registry = new File(userdir, "runtime-rcpapplication.product/corpora/NOV13-P1/registry/nov13-p1"); //$NON-NLS-1$ |
161 | 190 |
File data = new File(userdir, "runtime-rcpapplication.product/corpora/NOV13-P1/data/NOV13-P1"); //$NON-NLS-1$ |
162 | 191 |
CompressCQPIndexes.compressAll(tools, registry, "NOV13-P1", data, true); |
163 |
|
|
192 |
|
|
164 | 193 |
} catch (Exception e) { |
165 | 194 |
e.printStackTrace(); |
166 | 195 |
} |
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/projects/nov13/CreateTheOtherTurns.groovy (revision 3466) | ||
---|---|---|
146 | 146 |
boolean shouldCloseOtherTurn = false; |
147 | 147 |
|
148 | 148 |
def m0 = word =~ startAndEndOtherReg |
149 |
if (m0.matches()) { |
|
149 |
def m1 = word =~ startOtherReg |
|
150 |
def m2 = word =~ endOtherReg |
|
151 |
|
|
152 |
if (word.trim().equals("*")) { |
|
153 |
if (debug) println "- ligne "+parser.getLocation().getLineNumber()+" : ouverture|fermeture de other avec '$word' -> tours '$turnInfos'" |
|
154 |
|
|
155 |
if (other) { // closing * |
|
156 |
previousOtherStarting = ["word='*' location="+getLocation(true, false, false)] |
|
157 |
|
|
158 |
shouldCloseOtherTurn = true; |
|
159 |
|
|
160 |
wordToWrite = "" |
|
161 |
} else { |
|
162 |
previousOtherStarting = ["word='*' location="+getLocation(true, false, false)] |
|
163 |
|
|
164 |
//if (other) { // don't restart a Turn if already in a Other Turn |
|
165 |
writer.writeEndElement() // current Turn |
|
166 |
writer.writeCharacters("\n") |
|
167 |
|
|
168 |
def tmpInfos = new LinkedHashMap() |
|
169 |
for (String attr : turnInfos.keySet()) tmpInfos[attr] = turnInfos[attr] |
|
170 |
tmpInfos["orig-speaker"] = turnInfos["speaker"] |
|
171 |
|
|
172 |
if (primarySpeakerIdRegex == null || turnInfos["speaker"] ==~ primarySpeakerIdRegex) { // the current speaker is not the primary speaker |
|
173 |
tmpInfos["speaker"] = otherNonPrimarySpeakerId |
|
174 |
} else { |
|
175 |
tmpInfos["speaker"] = primarySpeakerId |
|
176 |
} |
|
177 |
tmpInfos["startTime"] = currentTime |
|
178 |
writer.writeStartElement("Turn") |
|
179 |
for (String attr : tmpInfos.keySet()) { |
|
180 |
writer.writeAttribute(attr, tmpInfos[attr]) |
|
181 |
} |
|
182 |
writer.writeCharacters("\n") |
|
183 |
|
|
184 |
writer.writeStartElement("Sync") |
|
185 |
writer.writeAttribute("time", tmpInfos["startTime"]) |
|
186 |
writer.writeCharacters("\n") |
|
187 |
writer.writeEndElement() |
|
188 |
//} |
|
189 |
|
|
190 |
other = true |
|
191 |
wordToWrite = "" |
|
192 |
} |
|
193 |
} else if (m0.matches()) { |
|
150 | 194 |
if (other) { |
151 | 195 |
warnings << getLocation(true, false, false)+" with $word: Found a starting&ending * when one 'other' have been started at "+previousOtherStarting |
152 | 196 |
} |
... | ... | |
188 | 232 |
wordToWrite = m0.group(2) |
189 | 233 |
other = false |
190 | 234 |
//} |
191 |
} |
|
192 |
|
|
193 |
def m1 = word =~ startOtherReg |
|
194 |
|
|
195 |
if (!m0.matches() && m1.matches()) { // not and start&end but only a start |
|
235 |
} else if (m1.matches()) { // not and start&end but only a start |
|
196 | 236 |
|
197 | 237 |
if (other) { |
198 | 238 |
warnings << getLocation(true, false, false)+" with $word: Found a starting * when one 'other' have been started at "+previousOtherStarting |
... | ... | |
236 | 276 |
other = true |
237 | 277 |
wordToWrite = m1.group(2) |
238 | 278 |
//} |
239 |
} |
|
240 |
|
|
241 |
def m2 = word =~ endOtherReg |
|
242 |
if (!m1.matches() && !m0.matches() && m2.matches()) { |
|
279 |
} else if (m2.matches()) { |
|
243 | 280 |
if (debug) println "- ligne "+parser.getLocation().getLineNumber()+" : fermeture de other avec '$word' -> tours '$turnInfos'" |
244 | 281 |
|
245 | 282 |
if (!other) { |
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/SegmentTRSInSectionFromMarkerMacro.groovy (revision 3466) | ||
---|---|---|
105 | 105 |
String content = null; |
106 | 106 |
if (node instanceof String) { |
107 | 107 |
content = node |
108 |
} else if (node instanceof groovy.util.Node && node.name() == "w" && node.text().contains(newSectionMarker)) {
|
|
108 |
} else if (node instanceof groovy.util.Node && node.name() == "w") { |
|
109 | 109 |
content = node.text().trim() |
110 | 110 |
start = Float.parseFloat(node.@time) |
111 | 111 |
} |
112 | 112 |
|
113 |
if (content.equals(newSectionMarker)) { |
|
113 |
if (content != null && (content.equals(newSectionMarker) || content.startsWith(newSectionMarker) || content.endsWith(newSectionMarker))) { |
|
114 |
|
|
114 | 115 |
if (debug) println "New section at $turn with $node child node" |
115 | 116 |
previousSection = currentSection |
116 | 117 |
currentSection = new Node(trsEpisode, "Section", new LinkedHashMap(["type":newSectionMarker, "startTime":turn.@startTime, "endTime":previousSection.attributes()["endTime"]])) |
... | ... | |
128 | 129 |
newTurn.attributes()["startTime"] = start |
129 | 130 |
turn.attributes()["endTime"] = start |
130 | 131 |
|
132 |
def syncNode = new Node(newTurn, "Sync", new LinkedHashMap()) |
|
133 |
syncNode.attributes()["time"] = start |
|
131 | 134 |
nFound++ |
132 | 135 |
|
133 | 136 |
} |
134 | 137 |
|
135 | 138 |
children.remove(i) // remove the mark |
136 | 139 |
i-- |
140 |
if (content.startsWith(newSectionMarker)) { // remove the marker and keep the tail content |
|
141 |
node.value = node.text().substring(newSectionMarker.length()) |
|
142 |
newTurn.children().add(node) |
|
143 |
} else if (content.endsWith(newSectionMarker)) { // remove the marker and keep the head content |
|
144 |
node.value = node.text().substring(0, node.text().length() - newSectionMarker.length()) |
|
145 |
newTurn.children().add(node) |
|
146 |
} |
|
137 | 147 |
|
148 |
|
|
138 | 149 |
} else if (newTurn != null) { |
139 | 150 |
turn.children().remove(i) |
140 | 151 |
i-- |
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/Vocapia2Transcriber.groovy (revision 3466) | ||
---|---|---|
82 | 82 |
case "Speaker": // <Speaker ch="1" dur="531.38" gender="X" spkid="Enquêtrice" lang="fre" lconf="1.00" nw="1586" tconf="0.95"/> -> <Speaker id="spk1" name="enq4" check="no" dialect="native" accent="" scope="local"/> |
83 | 83 |
|
84 | 84 |
writer.writeStartElement("Speaker") |
85 |
writer.writeAttribute("id", parser.getAttributeValue(null, "spkid")) |
|
86 |
writer.writeAttribute("name", parser.getAttributeValue(null, "spkid")) |
|
85 |
writer.writeAttribute("id", parser.getAttributeValue(null, "spkid").trim())
|
|
86 |
writer.writeAttribute("name", parser.getAttributeValue(null, "spkid").trim())
|
|
87 | 87 |
writer.writeAttribute("check", "") |
88 | 88 |
writer.writeAttribute("dialect", parser.getAttributeValue(null, "lang")) |
89 | 89 |
writer.writeAttribute("accent", parser.getAttributeValue(null, "gender")) |
... | ... | |
94 | 94 |
/** |
95 | 95 |
* remove the additional speaker if already written |
96 | 96 |
*/ |
97 |
if (additionalSpeakers.containsKey(parser.getAttributeValue(null, "spkid"))) { |
|
98 |
additionalSpeakers.remove(parser.getAttributeValue(null, "spkid")) |
|
97 |
if (additionalSpeakers.containsKey(parser.getAttributeValue(null, "spkid").trim())) {
|
|
98 |
additionalSpeakers.remove(parser.getAttributeValue(null, "spkid").trim())
|
|
99 | 99 |
} |
100 | 100 |
break; |
101 | 101 |
|
... | ... | |
119 | 119 |
writer.writeStartElement("Turn") |
120 | 120 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) { |
121 | 121 |
String name = parser.getAttributeLocalName(i) |
122 |
String value = parser.getAttributeValue(i) |
|
122 | 123 |
if (name == "stime") name = "startTime" |
123 | 124 |
else if (name == "etime") name = "endTime" |
124 |
else if (name == "spkid") name = "speaker" |
|
125 |
|
|
126 |
writer.writeAttribute(name, parser.getAttributeValue(i)) |
|
125 |
else if (name == "spkid") { |
|
126 |
name = "speaker" |
|
127 |
value = value.trim() |
|
128 |
} |
|
129 |
writer.writeAttribute(name, value) |
|
127 | 130 |
} |
128 | 131 |
|
129 | 132 |
writer.writeCharacters("\n") |
Formats disponibles : Unified diff