Révision 911
tmp/org.txm.para.rcp/.classpath (revision 911) | ||
---|---|---|
1 | 1 |
<?xml version="1.0" encoding="UTF-8"?> |
2 | 2 |
<classpath> |
3 | 3 |
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/> |
4 |
<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/> |
|
4 |
<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"> |
|
5 |
<accessrules> |
|
6 |
<accessrule kind="accessible" pattern="**"/> |
|
7 |
</accessrules> |
|
8 |
</classpathentry> |
|
5 | 9 |
<classpathentry kind="src" path="src"/> |
6 | 10 |
<classpathentry kind="output" path="bin"/> |
7 | 11 |
</classpath> |
tmp/org.txm.para.rcp/META-INF/MANIFEST.MF (revision 911) | ||
---|---|---|
1 | 1 |
Manifest-Version: 1.0 |
2 |
Require-Bundle: org.txm.utils;bundle-version="1.0.0";visibility:=reexp |
|
3 |
ort,org.eclipse.core.runtime;bundle-version="3.10.0";visibility:=reex |
|
4 |
port,org.eclipse.osgi;bundle-version="3.10.2";visibility:=reexport,or |
|
5 |
g.eclipse.ui;visibility:=reexport,org.txm.concordance.rcp;visibility: |
|
6 |
=reexport,org.txm.searchengine.cqp.core;bundle-version="1.1.0";visibi |
|
7 |
lity:=reexport,org.txm.rcp;bundle-version="0.7.8";visibility:=reexpor |
|
8 |
t,org.txm.core;bundle-version="0.7.0";visibility:=reexport,org.txm.pa |
|
9 |
ra.core;visibility:=reexport,org.txm.searchengine.core;bundle-version |
|
10 |
="1.0.0";visibility:=reexport |
|
2 |
Require-Bundle: org.txm.concordance.rcp;visibility:=reexport, |
|
3 |
org.txm.para.core;visibility:=reexport |
|
11 | 4 |
Export-Package: org.txm.para.rcp.editors, |
12 | 5 |
org.txm.para.rcp.handlers, |
13 | 6 |
org.txm.para.rcp.messages |
tmp/org.txm.textsbalance.feature/feature.xml (revision 911) | ||
---|---|---|
65 | 65 |
</license> |
66 | 66 |
|
67 | 67 |
<requires> |
68 |
<import plugin="org.txm.searchengine.cqp.core" version="1.1.0" match="greaterOrEqual"/> |
|
69 |
<import plugin="org.txm.searchengine.core" version="1.0.0" match="greaterOrEqual"/> |
|
70 |
<import plugin="org.txm.rcp" version="0.7.7" match="greaterOrEqual"/> |
|
71 |
<import plugin="org.eclipse.core.runtime" version="3.10.0" match="greaterOrEqual"/> |
|
72 |
<import plugin="org.eclipse.ui" version="3.106.1" match="greaterOrEqual"/> |
|
73 |
<import plugin="org.eclipse.jface.text"/> |
|
74 |
<import plugin="org.eclipse.ui.editors"/> |
|
75 |
<import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/> |
|
76 |
<import plugin="org.txm.chartsengine.r.core"/> |
|
77 |
<import plugin="org.eclipse.core.expressions" version="3.4.600" match="greaterOrEqual"/> |
|
78 | 68 |
<import plugin="org.txm.textsbalance.core" version="1.0.0" match="greaterOrEqual"/> |
79 |
<import plugin="org.txm.chartsengine.jfreechart.core" version="1.0.0" match="greaterOrEqual"/> |
|
80 | 69 |
<import plugin="org.txm.chartsengine.rcp" version="1.0.0" match="greaterOrEqual"/> |
70 |
<import plugin="org.txm.searchengine.cqp.core" version="1.1.0" match="greaterOrEqual"/> |
|
71 |
<import plugin="org.txm.chartsengine.jfreechart.core" version="1.0.0" match="greaterOrEqual"/> |
|
72 |
<import plugin="org.txm.chartsengine.r.core" version="1.0.0" match="greaterOrEqual"/> |
|
81 | 73 |
</requires> |
82 | 74 |
|
83 | 75 |
<plugin |
tmp/org.txm.para.core/.classpath (revision 911) | ||
---|---|---|
1 | 1 |
<?xml version="1.0" encoding="UTF-8"?> |
2 | 2 |
<classpath> |
3 | 3 |
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/> |
4 |
<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/> |
|
4 |
<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"> |
|
5 |
<accessrules> |
|
6 |
<accessrule kind="accessible" pattern="**"/> |
|
7 |
</accessrules> |
|
8 |
</classpathentry> |
|
5 | 9 |
<classpathentry kind="src" path="src"/> |
6 | 10 |
<classpathentry kind="output" path="bin"/> |
7 | 11 |
</classpath> |
tmp/org.txm.para.core/META-INF/MANIFEST.MF (revision 911) | ||
---|---|---|
1 | 1 |
Manifest-Version: 1.0 |
2 |
Require-Bundle: org.txm.libs.groovy-all;bundle-version="2.3.3";visibil |
|
3 |
ity:=reexport,org.txm.utils;bundle-version="1.0.0";visibility:=reexpo |
|
4 |
rt,org.eclipse.osgi;bundle-version="3.10.2";visibility:=reexport,org. |
|
5 |
eclipse.core.runtime;bundle-version="3.10.0";visibility:=reexport,org |
|
6 |
.txm.searchengine.cqp.core;bundle-version="1.1.0";visibility:=reexpor |
|
7 |
t,org.txm.core;bundle-version="0.7.0";visibility:=reexport,org.eclips |
|
8 |
e.ui;visibility:=reexport,org.txm.searchengine.core;bundle-version="1 |
|
9 |
.0.0";visibility:=reexport |
|
2 |
Require-Bundle: org.txm.searchengine.cqp.core;bundle-version="1.1.0";visibility:=reexport |
|
10 | 3 |
Export-Package: org.txm.para.core.functions |
11 | 4 |
Bundle-ActivationPolicy: lazy |
12 | 5 |
Bundle-Version: 1.0.0.qualifier |
tmp/org.txm.treetagger.files.feature/feature.xml (revision 911) | ||
---|---|---|
69 | 69 |
</license> |
70 | 70 |
|
71 | 71 |
<requires> |
72 |
<import plugin="org.txm.utils"/> |
|
73 |
<import plugin="org.eclipse.core.runtime" version="3.10.0" match="greaterOrEqual"/> |
|
74 | 72 |
<import plugin="org.txm.core" version="0.8.0" match="greaterOrEqual"/> |
75 | 73 |
<import plugin="org.txm.treetagger.core" version="1.0.0" match="greaterOrEqual"/> |
76 |
<import plugin="org.eclipse.ui"/> |
|
77 |
<import plugin="org.eclipse.swt"/> |
|
78 |
<import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/> |
|
79 | 74 |
<import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/> |
80 |
<import plugin="org.txm.libs.groovy-all" version="2.3.3" match="greaterOrEqual"/> |
|
81 |
<import plugin="org.txm.utils" version="1.0.0" match="greaterOrEqual"/> |
|
82 | 75 |
</requires> |
83 | 76 |
|
84 | 77 |
<plugin |
tmp/org.txm.analec.rcp/src/org/txm/analec/imports/AnalecAnnotationsImporter.java (revision 911) | ||
---|---|---|
18 | 18 |
import org.apache.commons.lang.StringUtils; |
19 | 19 |
import org.eclipse.core.runtime.IProgressMonitor; |
20 | 20 |
import org.txm.Toolbox; |
21 |
import org.txm.importer.graal.PersonalNamespaceContext;
|
|
21 |
import org.txm.importer.PersonalNamespaceContext; |
|
22 | 22 |
import org.txm.searchengine.cqp.AbstractCqiClient; |
23 | 23 |
import org.txm.searchengine.cqp.CQPSearchEngine; |
24 | 24 |
import org.txm.searchengine.cqp.corpus.MainCorpus; |
tmp/org.txm.analec.rcp/src/org/txm/analec/imports/DOMAnalecAnnotationsImporter.java (revision 911) | ||
---|---|---|
8 | 8 |
|
9 | 9 |
import org.eclipse.core.runtime.IProgressMonitor; |
10 | 10 |
import org.txm.Toolbox; |
11 |
import org.txm.importer.graal.PersonalNamespaceContext;
|
|
11 |
import org.txm.importer.PersonalNamespaceContext; |
|
12 | 12 |
import org.txm.searchengine.cqp.AbstractCqiClient; |
13 | 13 |
import org.txm.searchengine.cqp.CQPSearchEngine; |
14 | 14 |
import org.txm.searchengine.cqp.corpus.MainCorpus; |
tmp/org.txm.analec.rcp/src/org/txm/analec/export/AnalecAnnotationTEIExporter.java (revision 911) | ||
---|---|---|
17 | 17 |
import org.txm.Toolbox; |
18 | 18 |
import org.txm.importer.StaxIdentityParser; |
19 | 19 |
import org.txm.importer.StaxStackWriter; |
20 |
import org.txm.importer.graal.PersonalNamespaceContext;
|
|
20 |
import org.txm.importer.PersonalNamespaceContext; |
|
21 | 21 |
import org.txm.objects.BaseParameters; |
22 | 22 |
import org.txm.rcp.Application; |
23 | 23 |
import org.txm.rcp.TxmPreferences; |
tmp/org.txm.wordcloud.feature/feature.xml (revision 911) | ||
---|---|---|
67 | 67 |
</license> |
68 | 68 |
|
69 | 69 |
<requires> |
70 |
<import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/> |
|
71 |
|
|
72 |
<import plugin="org.txm.utils" version="1.0.0" match="greaterOrEqual"/> |
|
73 |
<import plugin="org.eclipse.core.runtime" version="3.10.0" match="greaterOrEqual"/> |
|
74 |
<import plugin="org.txm.chartsengine.core" version="1.0.0" match="greaterOrEqual"/> |
|
75 |
<import plugin="org.txm.searchengine.cqp.core" version="1.1.0" match="greaterOrEqual"/> |
|
76 |
<import plugin="org.txm.statsengine.r.core" version="1.0.0" match="greaterOrEqual"/> |
|
77 | 70 |
<import plugin="org.txm.chartsengine.jfreechart.core" version="1.0.0" match="greaterOrEqual"/> |
78 |
<import plugin="org.txm.statsengine.core" version="1.0.0" match="greaterOrEqual"/> |
|
79 | 71 |
<import plugin="org.txm.chartsengine.r.core" version="1.0.0" match="greaterOrEqual"/> |
80 | 72 |
<import plugin="org.txm.index.core" version="1.0.0" match="greaterOrEqual"/> |
81 |
<import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/> |
|
82 |
<import plugin="org.eclipse.osgi" version="3.10.2" match="greaterOrEqual"/> |
|
83 | 73 |
<import plugin="org.txm.index.rcp" version="1.0.0" match="greaterOrEqual"/> |
84 |
<import plugin="org.eclipse.ui" version="3.106.1" match="greaterOrEqual"/> |
|
85 | 74 |
<import plugin="org.txm.wordcloud.core" version="1.0.0" match="greaterOrEqual"/> |
86 | 75 |
<import plugin="org.txm.chartsengine.rcp"/> |
87 | 76 |
</requires> |
tmp/org.txm.dictionary.feature/feature.xml (revision 911) | ||
---|---|---|
17 | 17 |
</license> |
18 | 18 |
|
19 | 19 |
<requires> |
20 |
<import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/> |
|
21 |
<import plugin="org.txm.libs.groovy-all" version="2.3.3" match="greaterOrEqual"/> |
|
22 |
<import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/> |
|
23 |
<import plugin="org.eclipse.ui"/> |
|
24 |
<import plugin="org.eclipse.core.runtime"/> |
|
25 |
<import plugin="org.txm.utils"/> |
|
26 |
<import plugin="org.eclipse.persistence.jpa" version="2.6.0" match="greaterOrEqual"/> |
|
27 |
<import plugin="org.txm.index.core" version="1.0.0" match="greaterOrEqual"/> |
|
28 | 20 |
<import plugin="org.txm.index.rcp" version="1.0.0" match="greaterOrEqual"/> |
29 |
<import plugin="javax.persistence"/> |
|
30 | 21 |
<import plugin="org.txm.annotation.kr.core" version="1.0.0" match="greaterOrEqual"/> |
31 | 22 |
</requires> |
32 | 23 |
|
tmp/org.txm.wordcloud.rcp/META-INF/MANIFEST.MF (revision 911) | ||
---|---|---|
1 | 1 |
Manifest-Version: 1.0 |
2 |
Require-Bundle: org.txm.rcp;bundle-version="0.7.8";visibility:=reexpor |
|
3 |
t,org.txm.utils;bundle-version="1.0.0";visibility:=reexport,org.eclip |
|
4 |
se.osgi;bundle-version="3.10.2";visibility:=reexport,org.txm.index.rc |
|
5 |
p;bundle-version="1.0.0";visibility:=reexport,org.eclipse.core.runtim |
|
6 |
e;bundle-version="3.10.0";visibility:=reexport,org.txm.chartsengine.c |
|
7 |
ore;bundle-version="1.0.0";visibility:=reexport,org.txm.index.core;bu |
|
8 |
ndle-version="1.0.0";visibility:=reexport,org.txm.searchengine.cqp.co |
|
9 |
re;bundle-version="1.1.0";visibility:=reexport,org.eclipse.ui;bundle- |
|
10 |
version="3.106.1";visibility:=reexport,org.txm.wordcloud.core;bundle- |
|
11 |
version="1.0.0";visibility:=reexport,org.txm.chartsengine.rcp;visibil |
|
12 |
ity:=reexport,org.txm.core;bundle-version="0.7.0";visibility:=reexpor |
|
13 |
t |
|
2 |
Require-Bundle: org.txm.index.rcp;bundle-version="1.0.0";visibility:=reexport, |
|
3 |
org.txm.wordcloud.core;bundle-version="1.0.0";visibility:=reexport, |
|
4 |
org.txm.chartsengine.rcp;visibility:=reexport |
|
14 | 5 |
Bundle-Vendor: Textometrie.org |
15 | 6 |
Bundle-ActivationPolicy: lazy |
16 | 7 |
Bundle-Version: 1.0.0.qualifier |
tmp/org.txm.tigersearch.feature/feature.xml (revision 911) | ||
---|---|---|
17 | 17 |
</license> |
18 | 18 |
|
19 | 19 |
<requires> |
20 |
<import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/> |
|
21 |
<import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/> |
|
22 |
<import plugin="org.eclipse.ui"/> |
|
23 |
<import plugin="org.eclipse.core.runtime"/> |
|
24 |
<import plugin="org.eclipse.ui.editors" version="3.8.200" match="greaterOrEqual"/> |
|
25 |
<import plugin="org.eclipse.swt"/> |
|
26 |
<import plugin="org.txm.searchengine.core" version="1.0.0" match="greaterOrEqual"/> |
|
27 |
<import plugin="org.txm.searchengine.cqp.core" version="1.1.0" match="greaterOrEqual"/> |
|
28 |
<import plugin="org.txm.statsengine.core" version="1.0.0" match="greaterOrEqual"/> |
|
29 |
<import plugin="org.txm.statsengine.r.core" version="1.0.0" match="greaterOrEqual"/> |
|
30 |
<import plugin="org.txm.statsengine.r.rcp" version="1.0.0" match="greaterOrEqual"/> |
|
31 |
<import plugin="org.txm.index.core" version="1.0.0" match="greaterOrEqual"/> |
|
32 | 20 |
<import plugin="org.txm.index.rcp" version="1.0.0" match="greaterOrEqual"/> |
33 |
<import plugin="org.txm.utils"/> |
|
34 |
<import plugin="org.txm.groovy.core" version="1.0.0" match="greaterOrEqual"/> |
|
35 | 21 |
</requires> |
36 | 22 |
|
37 | 23 |
<plugin |
tmp/org.txm.cql2lsa.feature/feature.xml (revision 911) | ||
---|---|---|
17 | 17 |
</license> |
18 | 18 |
|
19 | 19 |
<requires> |
20 |
<import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/> |
|
21 | 20 |
<import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/> |
22 |
<import plugin="org.eclipse.ui"/> |
|
23 |
<import plugin="org.eclipse.core.runtime"/> |
|
24 |
<import plugin="org.eclipse.ui.editors" version="3.8.100" match="greaterOrEqual"/> |
|
25 |
<import plugin="org.eclipse.core.expressions" version="3.4.500" match="greaterOrEqual"/> |
|
26 |
<import plugin="org.txm.index.core"/> |
|
27 |
<import plugin="org.txm.lexicaltable.core"/> |
|
28 |
<import plugin="org.txm.statsengine.r.core"/> |
|
29 | 21 |
</requires> |
30 | 22 |
|
31 | 23 |
<plugin |
tmp/org.txm.tigersearch.rcp/.settings/org.eclipse.jdt.groovy.core.prefs (revision 911) | ||
---|---|---|
1 | 1 |
eclipse.preferences.version=1 |
2 |
groovy.compiler.level=23 |
|
2 |
groovy.compiler.level=-1 |
|
3 |
groovy.script.filters=**/*.dsld,y,**/*.gradle,n |
tmp/org.txm.tigersearch.rcp/META-INF/MANIFEST.MF (revision 911) | ||
---|---|---|
131 | 131 |
ls.debugger,org.mozilla.javascript.tools.idswitch,org.mozilla.javascr |
132 | 132 |
ipt.tools.jsc,org.mozilla.javascript.tools.shell,org.relaxng.datatype |
133 | 133 |
,org.relaxng.datatype.helpers,org.txm.export.ts,org.txm.function.tige |
134 |
rsearch,org.txm.importer.srcmf,org.txm.importer.tigersearch,org.txm.s
|
|
134 |
rsearch,org.txm.s |
|
135 | 135 |
earchengine.ts,org.txm.test,org.txm.tigersearch.commands,org.txm.tige |
136 | 136 |
rsearch.editors,org.w3c.css.sac,org.w3c.css.sac.helpers,org.w3c.dom.s |
137 | 137 |
mil,org.w3c.dom.svg,tigerAPI,tigerAPI.converters,tigerAPI.theories.ho |
tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/PunctInjectImport.groovy (revision 911) | ||
---|---|---|
1 |
// Script to restore punctuation etc. to results of TS query. |
|
2 |
// Inputs: |
|
3 |
// --- Tiger-XML Document node |
|
4 |
// --- Java array: |
|
5 |
// --- --- String [index][type] where: |
|
6 |
// --- --- --- type == 0 gives the xml:id |
|
7 |
// --- --- --- type == 1 gives the word form |
|
8 |
// Process: |
|
9 |
// --- Injects punctuation. |
|
10 |
// Returns: |
|
11 |
// --- Tiger-XML Document node. |
|
12 |
|
|
13 |
// TXM package statement |
|
14 |
package org.txm.importer.srcmf |
|
15 |
|
|
16 |
import javax.xml.parsers.DocumentBuilderFactory |
|
17 |
|
|
18 |
|
|
19 |
|
|
20 |
/** |
|
21 |
* |
|
22 |
* @author tmr |
|
23 |
* |
|
24 |
*/ |
|
25 |
class PunctInjectImport
{

    /** Namespace helper for the XML namespace; used to read the xml:id attribute of words. */
    static nsXml = new groovy.xml.Namespace('http://www.w3.org/XML/1998/namespace', 'xml')
    /** Namespace helper for the TEI namespace; used to recognise tei:w elements. */
    static nsTei = new groovy.xml.Namespace('http://www.tei-c.org/ns/1.0', 'tei')
    /** Root DOM element of the parsed Tiger-XML document; modified in place by process(). */
    def tigerXml = null
    /** Reference word table: a list of [id, form] pairs. process() works on a copy of it. */
    def txmIdWordTableFixed = []
    /** File the resulting Tiger-XML is written to; defaults to "output.xml". */
    def outputFile = null

    /*
     * Call this constructor when no need of XML-TXM file for word ID table
     *
     * tigerXmlFile:   the Tiger-XML file to parse
     * txmIdWordTable: list of [id, form] entries (index 0 is the id, index 1 the word form)
     */
    PunctInjectImport(File tigerXmlFile, ArrayList txmIdWordTable) {
        def builder = DocumentBuilderFactory.newInstance().newDocumentBuilder()
        tigerXml = builder.parse(tigerXmlFile).documentElement
        txmIdWordTableFixed = txmIdWordTable
        outputFile = new File("output.xml")
    }

    /*
     * Call this constructor when the XML-TXM file is needed for the word list.
     *
     * The word table is built from every tei:w element found below the "text"
     * children of the XML-TXM root: the id comes from the "id" attribute (or
     * xml:id as a fallback) and the form from the deep text of the first
     * "txm:form" child.
     */
    PunctInjectImport(File tigerXmlFile, File xmlTxmFile) {
        def builder = DocumentBuilderFactory.newInstance().newDocumentBuilder()
        tigerXml = builder.parse(tigerXmlFile).documentElement
        txmIdWordTableFixed = []
        def xmlTxmGroovyDOM = new XmlParser().parse(xmlTxmFile)
        for (def w : xmlTxmGroovyDOM.text.'**'.findAll { it.name() == nsTei.'w' } ) {
            def anId = w.'@id'
            if (! anId) {
                anId = w.attribute(nsXml.id)
            }
            if (! anId) {
                // The entry is still recorded (with a null id), only a message is printed.
                println "Error: ID attribute not located in XML TXM file"
            }
            txmIdWordTableFixed << [anId, getDeepText(w."txm:form"[0])]
        }
        outputFile = new File("output.xml")
    }

    /**
     * Injects the words of the reference table that are missing from the
     * Tiger-XML document (punctuation etc.), then moves sentence-initial closing
     * punctuation to the end of the previous sentence, and finally writes the
     * document to outputFile (UTF-8).
     *
     * @throws IllegalStateException if the Tiger-XML document contains no "t" element
     */
    def process () {
        def txmIdWordTable = new ArrayList(this.txmIdWordTableFixed)
        def allTs = toList(this.tigerXml.getElementsByTagName('t'))
        if (!allTs) {
            // The first <t> is needed as a template (attribute names, id prefix).
            throw new IllegalStateException('No <t> element found in the Tiger-XML document')
        }
        def tAttrs = getTAttrNames(allTs[0])
        def tNodeIdPrefix = allTs[0].getAttribute('id').tokenize('#').first()
        def addedWordIds = []
        int tIx = 0
        // First phase: walk the reference words and the <t> nodes in parallel.
        while (txmIdWordTable) {
            def word = txmIdWordTable.remove(0)
            if (tIx == allTs.size()) {
                // End of TS file, but still words left in the BFM file.
                addTNode(word, allTs.last(), tAttrs, tNodeIdPrefix, 'append')
            }
            else {
                def tNode = allTs[tIx]
                def tId = getTNodeId(tNode)
                if (tId == word[0] && tNode.getAttribute('word') == word[1]) {
                    // all good: same id and same form
                    tIx += 1
                }
                else if (tId == word[0]) {
                    println("Mismatched Ids! ($tId)")
                    tIx += 1
                }
                else if (['#', '*'].contains(tNode.getAttribute('word')) ) {
                    // SRCMF duplicata; try comparing word against the next tNode next time
                    // around.
                    txmIdWordTable.add(0, word)
                    tIx += 1
                }
                // Check that the SRCMF corpus doesn't have a bug in it...
                // take(tIx) is used instead of [0..tIx - 1]: when tIx is 0 that range
                // is 0..-1, i.e. the WHOLE list, and the current node would find itself.
                else if ( !(word[1] =~ /[\,\.\?\!\:\;\(\)\[\]\{\}]/)
                && (allTs.take(tIx).find{
                    it.getAttribute('id') == "$tNodeIdPrefix#$tId"
                })) {
                    println "Warning: word ${tNode.getAttribute('word')}, id $tId appears twice in corpus!"
                    txmIdWordTable.add(0, word)
                    tIx += 1
                }
                // Check that there's not an extra word in the SRCMF corpus (rare, usu. a tokenisation change)
                else if ( !(word[1] =~ /[\,\.\?\!\:\;\(\)\[\]\{\}]/)
                && (allTs[tIx..-1].find{
                    it.getAttribute('id') == "$tNodeIdPrefix#${word[0]}"
                })) {
                    println "Warning: word ${tNode.getAttribute('word')}, id $tId does not appear in BFM!"
                    txmIdWordTable.add(0, word)
                    tIx += 1
                }
                else if (addedWordIds.contains(tId)) {
                    println "Warning: word ${tNode.getAttribute('word')}, id ${tId} out of sequence in SRCMF corpus!"
                    txmIdWordTable.add(0, word)
                    tIx += 1
                }
                else {
                    // Insert word. In the first instance, it will have the same parent as
                    // the tNode before which it's being inserted.
                    addTNode(word, allTs[tIx], tAttrs, tNodeIdPrefix, 'before')
                    addedWordIds.add(word[0])
                }
            }
        }
        // Second phase: move punctuation into previous sentence,
        // dependent on sequence.
        def allTerminalses = toList(this.tigerXml.getElementsByTagName('terminals'))
        for (def i = 1 ; i < allTerminalses.size() ; i++) {
            def ts = toList(allTerminalses[i].getElementsByTagName('t'))
            def puncStack = []
            // Collect the leading tokens that contain no letter and no digit.
            // A-Za-z is spelled out: the range A-z also covers [ \ ] ^ _ and `.
            while (ts && (ts[0].getAttribute('word') =~ /[A-Za-zÀ-ÿ0-9]/).size() == 0) {
                puncStack.add(ts.remove(0))
            }
            // Now, treat the punctuation stack at the beginning of the sentence
            if ( puncStack ) {
                int moveLeft = 0
                // One character per stacked node, so string indexes equal stack indexes.
                // An empty word is represented by a space to keep that alignment.
                def puncString = puncStack.collect{
                    def form = it.getAttribute('word')
                    form ? form[0] : ' '
                }.join('')
                // First, identify LAST instance of sentence-final punctuation.
                def matches = puncString =~ /[\.\,\;\:\!\?\)\]\}»”’]/
                if (matches.size() > 0) {
                    moveLeft = puncString.lastIndexOf(matches[-1]) + 1
                }
                // Second, split pairs of straight quotes
                matches = puncString =~ /(""|'')/ //"
                if (matches.size() > 0) {
                    moveLeft = [moveLeft, puncString.lastIndexOf(matches[-1][0]) + 1].max()
                }
                // Now, move moveLeft punctuation nodes to the end of the prev. sentence
                ts = toList(allTerminalses[i].getElementsByTagName('t'))
                for (def j = 0 ; j < moveLeft ; j++ ) {
                    allTerminalses[i - 1].appendChild(ts[j])
                }
            }
        }
        outputFile.withWriter("UTF-8") { writer ->
            writer.println(this.tigerXml)
        }
    }

    /**
     * Creates a new "t" element for the given word and attaches it next to tNode.
     *
     * word:          [id, form] entry of the word table
     * tNode:         existing "t" element used as anchor
     * tAttrs:        attribute names to create; 'id' and 'word' are filled from
     *                the word, every other attribute is set to '--'
     * tNodeIdPrefix: prefix put before '#' in the new id
     * where:         'before' inserts before tNode, 'append' appends to tNode's parent
     *
     * Throws IllegalArgumentException for any other value of where.
     */
    private addTNode(word, tNode, tAttrs, tNodeIdPrefix, where) {
        def newTNode = tNode.getOwnerDocument().createElement('t')
        for (def anAttr : tAttrs) {
            if (anAttr == 'id') {
                newTNode.setAttribute('id', "${tNodeIdPrefix}#${word[0]}")
            }
            else if (anAttr == 'word') {
                newTNode.setAttribute('word', word[1])
            }
            else {
                newTNode.setAttribute(anAttr, '--')
            }
        }
        if (where == 'before') {
            tNode.getParentNode().insertBefore(newTNode, tNode)
        }
        else if (where == 'append') {
            tNode.getParentNode().appendChild(newTNode)
        }
        else {
            throw new IllegalArgumentException('Bad before value')
        }
    }

    /** Returns the list of attribute names carried by the given DOM element. */
    def getTAttrNames(tNode) {
        def nodeMap = tNode.attributes
        def nameList = []
        for ( def i = 0 ; i < nodeMap.getLength() ; i++ ) {
            nameList.add( nodeMap.item(i).nodeName )
        }
        return nameList
    }

    /** Returns the part of the element's 'id' attribute that follows the last '#'. */
    def getTNodeId(tNode) {
        return tNode.getAttribute('id').tokenize('#').last()
    }

    /** Copies any iterable (here: DOM node lists) into a new list. */
    def toList(def iterable) {
        return iterable.findAll {true};
    }

    /**
     * Concatenates, depth first, all the String children found below the given
     * node of the XmlParser tree.
     */
    def getDeepText(def node) {
        def str = ""
        node.children().each { child ->
            if (child instanceof Node) {
                str += getDeepText(child)
            } else if (child instanceof String) {
                str += child
            }
        }
        return str
    }

    /**
     * Command line entry point: PunctInjectImport.groovy TigerXml.xml XmlTxm.xml
     * Options: -h/--help prints the usage, -o/--output sets the output file.
     * Exits with status 2 on a bad command line.
     */
    public static void main(String[] args)
    {
        def cli = new CliBuilder(
            usage:'PunctInjectImport.groovy TigerXml.xml XmlTxm.xml'
        )
        cli.h(longOpt:'help', 'Prints this message.')
        cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
        def options = cli.parse(args)
        if (options == null) {
            // parse() returns null on a malformed command line, after reporting it itself.
            System.exit(2)
        }
        if (options.h) {
            cli.usage()
            return
        }
        if (options.arguments().size() != 2) {
            println 'Incorrect number of command line arguments... exiting'
            // usage() prints the text itself and returns nothing: do not println it.
            cli.usage()
            System.exit(2)
        }
        def tigerXmlFile = new File(options.arguments()[0])
        def xmlTxmFile = new File(options.arguments()[1])
        def pii = new PunctInjectImport(tigerXmlFile, xmlTxmFile)
        if (options.o) {
            pii.outputFile = new File(options.o)
        }
        pii.process()
    }
}
tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/srcmfLoader.groovy (revision 911) | ||
---|---|---|
1 |
package org.txm.importer.srcmf |
|
2 |
|
|
3 |
// srcdir |
|
4 |
// txml/master.xml |
|
5 |
// txml/*.xml |
|
6 |
// txm/*.xml |
|
7 |
|
|
8 |
// 1 import xml-txm |
|
9 |
import ims.tiger.gui.tigerregistry.TIGERRegistry; |
|
10 |
import ims.tiger.index.writer.IndexBuilderErrorHandler; |
|
11 |
import ims.tiger.index.writer.SimpleErrorHandler; |
|
12 |
import ims.tiger.index.writer.XMLIndexing; |
|
13 |
import ims.tiger.system.*; |
|
14 |
|
|
15 |
import javax.xml.stream.*; |
|
16 |
|
|
17 |
import java.io.BufferedWriter; |
|
18 |
import java.io.File; |
|
19 |
import java.io.FileOutputStream; |
|
20 |
import java.io.IOException; |
|
21 |
import java.io.OutputStreamWriter; |
|
22 |
import java.io.PrintStream; |
|
23 |
import java.net.URL; |
|
24 |
import java.util.HashMap; |
|
25 |
import java.util.logging.FileHandler |
|
26 |
import java.io.File; |
|
27 |
|
|
28 |
import org.apache.log4j.BasicConfigurator; |
|
29 |
import org.txm.export.ts.*; |
|
30 |
import org.txm.importer.ApplyXsl2; |
|
31 |
import org.txm.utils.xml.DomUtils; |
|
32 |
import org.txm.importer.ValidateXml; |
|
33 |
import org.txm.importer.XPathResult; |
|
34 |
import org.txm.importer.xmltxm.compiler; |
|
35 |
import org.txm.importer.xml.pager; |
|
36 |
import org.txm.objects.*; |
|
37 |
import org.txm.utils.*; |
|
38 |
import org.txm.*; |
|
39 |
import org.txm.scripts.teitxm.*; |
|
40 |
import org.txm.metadatas.*; |
|
41 |
import org.txm.utils.i18n.*; |
|
42 |
import org.w3c.dom.Element |
|
43 |
import org.xml.sax.SAXException; |
|
44 |
|
|
45 |
import javax.xml.parsers.DocumentBuilderFactory |
|
46 |
|
|
47 |
// SRCMF corpus loader script.
// Steps, as performed below: read the import parameters, copy the valid XML-TXM
// sources into the binary corpus directory, build the search engine indexes,
// build the default HTML edition, save import.xml, then build the Tiger-XML
// (with punctuation re-injected) and index it with TIGERSearch.
String userDir = System.getProperty("user.home");
boolean debug = false
def MONITOR;
BaseParameters params;
// 'paramsBinding' and 'monitor' are expected in the script binding; when they
// are missing the script falls back to a hard-coded developer configuration.
try {
    params = paramsBinding
    MONITOR = monitor
} catch (Exception ignored) {
    println "DEV MODE";//exception means we debug
    debug = true
    params = new BaseParameters(new File(userDir, "xml/slethgier/import.xml"))
    params.load()
    if (!org.txm.Toolbox.isInitialized()) {
        Toolbox.setParam(Toolbox.INSTALL_DIR,new File("/usr/lib/TXM"));
        Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH,new File(userDir,"treetagger"));
        Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH,new File(userDir,"treetagger/models"));
        Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
        Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
        Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
        Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(System.getProperty("user.home"), "TXM"));
    }
}
if (params == null) { println "no parameters. Aborting"; return; }

// Import parameters of the corpus
String corpusname = params.getCorpusName();
Element corpusElem = params.corpora.get(corpusname);
String basename = params.name;
String rootDir = params.rootDir;
String lang = corpusElem.getAttribute("lang");
String model = lang
String encoding = corpusElem.getAttribute("encoding");
boolean annotate = "true" == corpusElem.getAttribute("annotate");
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
def xslParams = params.getXsltParams(corpusElem);

// Source layout: <rootDir>/xml-txm/*.xml and <rootDir>/tiger-xml/{master,header}.xml
File txmSrcDir = new File(rootDir, "xml-txm");
File tigerSrcDir = new File(rootDir, "tiger-xml");
File headerFile = new File(rootDir, "tiger-xml/header.xml");

File binDir = new File(Toolbox.getTXMHOMEPATH(),"corpora/"+basename);

// The binary corpus directory is rebuilt from scratch.
binDir.deleteDir();
binDir.mkdirs();
if (!binDir.exists()) {
    println "Could not create binDir "+binDir
    return;
}
if (!txmSrcDir.exists()) {
    println "Error: txm src dir does not exists: "+txmSrcDir
    return;
}
if (!tigerSrcDir.exists()) {
    println "Error: tiger src dir does not exists: "+tigerSrcDir
    return;
}
File txmDir = new File(binDir,"txm/$corpusname");
txmDir.deleteDir();
txmDir.mkdirs();
// copy txm files
List<File> srcfiles = txmSrcDir.listFiles();
for (File f : srcfiles) {// check XML format, and copy file into binDir
    // Parameter, metadata and properties files are not corpus texts.
    if (f.getName().equals("import.xml") || f.getName().equals("metadata.csv") || f.getName().endsWith(".properties"))
        continue;
    if (ValidateXml.test(f)) {
        FileCopy.copy(f, new File(txmDir, f.getName()));
    } else {
        println "Won't process file "+f;
    }
}
// listFiles() returns an EMPTY array (not null) for an empty directory, so
// both cases have to be tested to detect that nothing was copied.
File[] copiedFiles = txmDir.listFiles();
if (copiedFiles == null || copiedFiles.length == 0) {
    println "No txm file to process"
    return;
}

List<File> filelist = copiedFiles.toList();
Collections.sort(filelist);

if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(25, "COMPILING")
println "-- COMPILING - Building Search Engine indexes"
println "binDir: $binDir"
println "txmDir: $txmDir"
def c = new compiler();
if (debug) c.setDebug();
c.setLang(lang);
//c.setSortMetadata(sortMetadata)
if (!c.run(binDir, txmDir, basename, corpusname, filelist)) {
    println "import process stopped";
    return;
}

if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "EDITION")
println "-- EDITION - Building edition"
new File(binDir,"HTML/$corpusname").deleteDir();
new File(binDir,"HTML/$corpusname").mkdirs();
File outdir = new File(binDir,"/HTML/$corpusname/default/");
outdir.mkdirs();


// Number of file names already printed on the console.
def second = 0
println "Paginating text: "
for (File srcfile : filelist) {
    // Text name = file name without its extension
    String txtname = srcfile.getName();
    int dot = txtname.lastIndexOf(".");
    if (dot > 0) txtname = txtname.substring(0, dot);
    List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
    List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);
    Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile);
    // Console progress: names separated by ", ", with a line break every 5 names.
    if (second) { print(", ") }
    if (second > 0 && (second % 5) == 0) println ""
    print(srcfile.getName());
    // Incremented unconditionally: inside the && above it was never reached
    // while the counter was still 0, so it stayed 0 forever.
    second++
    def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, 500, basename, "pb");
    Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");
    // Pages are numbered from 1; each one is registered with the matching entry of ed.getIdx().
    for (int p = 0 ; p < ed.getPageFiles().size() ; p++) {
        String wordid = ed.getIdx().get(p);
        params.addPage(edition, ""+(p + 1), wordid);
    }
}

if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
if (MONITOR != null) MONITOR.worked(20, "FINALIZING")
File paramFile = new File(binDir, "import.xml");
DomUtils.save(params.root.getOwnerDocument(), paramFile);readyToLoad = true;


// TMR Modifications start here
File tigerDir = new File(binDir, "tiger");
tigerDir.mkdir();
File master = new File(tigerSrcDir, "master.xml");
// only one text file: take the source of the first validated text rather than
// listFiles()[0], which may be import.xml, metadata.csv or a .properties file.
File txmSrcFile = new File(txmSrcDir, filelist[0].getName());

// Run SRCMF cmd line corpus import script
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
def importer = new SrcmfImporter()
def tigerXmlAll = importer.doAllButPnc(master, txmSrcFile, headerFile, binDir)

// Create TigerXml-POS-PNC
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
println "create TIGER XML POS PNC"
PunctInjectImport injector;
if (Toolbox.isInitialized()) {
    // get idx from CQP
    // INCOMPLETE
    // def injector = new PunctInjectImport(tmpFile, words)
    println "Error: the loader is not ready to be launched from RCP"
    return;
} else {
    injector = new PunctInjectImport(tigerXmlAll, txmSrcFile)
}
injector.outputFile = new File(tigerDir, "TigerPnc.xml")
injector.process()

// Create the tigersearch.logprop file (used to launch TIGERSearch)
File logprop = new File(tigerDir, "tigersearch.logprop");
logprop.withWriter("UTF-8") { writer ->
    writer.write("""# Default log configuration of the TIGERSearch suite

log4j.rootLogger=WARN,Logfile
log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=INFO

log4j.appender.Logfile=org.apache.log4j.RollingFileAppender
log4j.appender.Logfile.File=\${user.home}/tigersearch/tigersearch.log
log4j.appender.Logfile.MaxFileSize=500KB
log4j.appender.Logfile.MaxBackupIndex=1

log4j.appender.Logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n""")
}

// Run TigerRegistry
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
BasicConfigurator.configure();
String uri = tigerXmlAll.getAbsolutePath();
File tigerBinDir = new File(tigerDir, corpusname)
tigerBinDir.mkdir()
try {
    IndexBuilderErrorHandler handler = new SimpleErrorHandler(tigerBinDir.getAbsolutePath());
    XMLIndexing indexing = new XMLIndexing(corpusname,uri, tigerBinDir.getAbsolutePath(), handler,false);
    indexing.startIndexing();
}
catch (IOException e) { System.out.println("IO: "+e.getMessage()); }
catch (SAXException e) { System.out.println("SAX: "+e.getMessage()); }

// The intermediate Tiger-XML file is removed once indexed.
tigerXmlAll.delete()
tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/TigerSubcorpus2Main.groovy (revision 911) | ||
---|---|---|
1 |
/* |
|
2 |
Combines subcorpus files to give a single main file. |
|
3 |
*/ |
|
4 |
// TXM package statement |
|
5 |
package org.txm.importer.srcmf |
|
6 |
|
|
7 |
// Command-line entry point |
|
8 |
|
|
9 |
// Command-line entry point: parse the options, then merge every subcorpus
// file referenced by the TIGER master file into one output file.
def cli = new CliBuilder(
    usage:'TigerSubcorpus2Main.groovy [options] tiger_master.xml'
)
cli.h(longOpt:'help', 'Prints this message.')
cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
options = cli.parse(args)
// cli.parse() returns null (after printing the usage message itself) when the
// command line cannot be parsed; dereferencing it would raise an NPE.
if (options == null) {
    System.exit(2)
}
// The -h option was declared but never honoured.
if (options.h) {
    cli.usage()
    System.exit(0)
}
if (options.arguments().size() != 1) {
    println 'Incorrect number of command line arguments... exiting'
    // usage() prints by itself and returns nothing: wrapping it in println
    // only appended a stray "null" line to the output.
    cli.usage()
    System.exit(2)
}
def tigerMaster = new File(options.arguments()[0])
// Default output name, overridden by -o / --output.
def outputFile = new File('outputfile.xml')
if (options.o) {
    outputFile = new File(options.o)
}
script(tigerMaster, outputFile)
|
26 |
|
|
27 |
/**
 * Merges the subcorpus files referenced by a TIGER master file into the
 * master document and writes the result as a single UTF-8 XML file.
 *
 * Each <subcorpus external="file:..."/> element found in a <body> is removed
 * and the children of the referenced document are appended to that body.
 *
 * @param tigerMaster the master file; subcorpus paths are resolved against its parent directory
 * @param outputFile  the file receiving the merged document
 */
def script (File tigerMaster, File outputFile) {
    def masterDoc = new XmlParser().parse(tigerMaster)
    masterDoc.body.each { body ->
        body.subcorpus.each { subNode ->
            // The external attribute carries a 5-character prefix that is
            // dropped to obtain the relative file name.
            String relativeName = (""+subNode.@external).substring(5)
            File subCorpusFile = new File(tigerMaster.getParentFile(), relativeName)
            def subDoc = new XmlParser().parse(subCorpusFile)
            body.remove(subNode)
            subDoc.children().each { child ->
                body.append(child)
            }
        }
    }
    // Render the merged tree to a string first, then write it after the XML declaration.
    def buffer = new StringWriter()
    def printer = new groovy.util.XmlNodePrinter(new PrintWriter(buffer))
    printer.print(masterDoc)
    def rez = buffer.toString()
    outputFile.withWriter("UTF-8") { writer ->
        writer.println('<?xml version="1.0" encoding="UTF-8"?>')
        writer.print(rez)
    }
}
tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/package.html (revision 911) | ||
---|---|---|
1 |
<html> |
|
2 |
<body> |
|
3 |
<p>TIGERSearch import module. This is a prototype that can only manage SRCMF TIGERSearch sources</p> |
|
4 |
</body> |
|
5 |
</html> |
|
0 | 6 |
tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/MasterReheader.groovy (revision 911) | ||
---|---|---|
1 |
#! /usr/bin/groovy |
|
2 |
package org.txm.importer.srcmf; |
|
3 |
|
|
4 |
/* |
|
5 |
* To change this template, choose Tools | Templates |
|
6 |
* and open the template in the editor. |
|
7 |
*/ |
|
8 |
|
|
9 |
// Set up globals |
|
10 |
// def masterFile = new File('/home/tomr/Documents/Work/lyon12/srcmf/groovy/SrcmfImport/aucassin_surface/master_pos.xml') |
|
11 |
// def outputFile = new File('/home/tomr/Documents/Work/lyon12/srcmf/groovy/SrcmfImport/aucassin_surface/master_pos2.xml') |
|
12 |
// def headerFile = new File('/home/tomr/Documents/Work/SRCMF/srcmf_ts/header_srcmf.xml') |
|
13 |
// def feats = [nt:['cat', 'type', 'coord'], t:['pos', 'form', 'q']] |
|
14 |
// def firstFeat = [t:'word', nt:'cat'] |
|
15 |
|
|
16 |
// Command-line entry point |
|
17 |
// Command-line entry point: parse the options and run the reheader on the
// given master and header files.
def cli = new CliBuilder(
    usage:'MasterReheader.groovy [options] master_file.xml header_file.xml'
)
cli.h(longOpt:'help', 'Prints this message.')
cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
cli.nt(longOpt:'ntfeats', args:1, argName:'ntfeats', 'NT features for which to provide value node.')
cli.t(longOpt:'tfeats', args:1, argName:'tfeats', 'T features for which to provide value node.')
cli.nt1(longOpt:'ntfeat1st', args:1, argName:'1st ntfeat', 'First NT feature listed in header.')
cli.t1(longOpt:'tfeat1st', args:1, argName:'1st tfeat', 'First T feature listed in header.')
options = cli.parse(args)
// cli.parse() returns null (usage already printed) on an unparsable command
// line, so it must be checked before being dereferenced.
if (options != null && options.h) {
    cli.usage()
} else if (options != null && options.arguments().size() == 2) {
    def masterFile = new File(options.arguments()[0])
    def headerFile = new File(options.arguments()[1])
    def masterFolder = masterFile.getCanonicalFile().getParent()
    // Default output goes next to the master file.
    def outputFile = options.o ?
        new File(options.o) :
        new File(masterFolder, 'MasterReheader_out.xml')
    // An absent option reads as false, hence the elvis defaults.
    def ntfirst = options.nt1 ?: 'cat'
    def tfirst = options.t1 ?: 'word'
    // 'nts' / 'ts' is the plural accessor of the 'nt' / 't' options: it yields
    // all the supplied values, or false when the option was not given. script()
    // calls contains() on these, so they must always be lists.
    def ntfeats = (options.nts ?: []) as List
    def tfeats = (options.ts ?: []) as List
    script(
        masterFile, headerFile, outputFile,
        ['nt':ntfeats, 't':tfeats],
        // Pass the defaulted values; the raw options were passed before,
        // which silently discarded the 'cat' / 'word' defaults.
        ['nt':ntfirst, 't':tfirst]
    )
} else if (options != null) {
    println 'Incorrect number of command line arguments... exiting'
    // usage() prints by itself; println around it only added a "null" line.
    cli.usage()
}
|
54 |
|
|
55 |
/**
 * Rebuilds the <feature> declarations of a TIGER master file from the
 * attributes actually used in its subcorpus files.
 *
 * @param masterFile the master file whose head/annotation section is rewritten
 * @param headerFile a header file providing descriptive text for feature values
 * @param outputFile where the updated master document is saved (UTF-8)
 * @param feats      per domain ('t' / 'nt'), the feature names for which
 *                   <value> nodes must be generated
 * @param firstFeat  per domain, the feature to list first; an entry naming an
 *                   unused feature is replaced in place by the alphabetically
 *                   first feature found
 */
def script(
    File masterFile, File headerFile, File outputFile, HashMap feats, HashMap firstFeat
) {
    // Load master and header files
    def master = new XmlParser().parse(masterFile)
    def header = new XmlParser().parse(headerFile)
    def masterFolder = masterFile.getCanonicalFile().getParent()

    // Attribute name -> set of observed values, per domain
    def attrVal = [nt:[:], t:[:]]

    // Scan subcorpus files and build attribute lists.
    master.body.subcorpus.each {
        def subcorpusFile = new File (masterFolder, it.'@external'[5..-1])
        def subcorpus = new XmlParser().parse(subcorpusFile)
        // Records every attribute value of a t or nt node.
        def getvals = { node, type ->
            node.attributes().each { mEntry ->
                if (! attrVal[type].keySet().contains(mEntry.getKey())) {
                    attrVal[type][mEntry.getKey()] = new HashSet()
                }
                attrVal[type][mEntry.getKey()].add(mEntry.getValue())
            }
        }
        subcorpus.s.graph.terminals.t.each { getvals.call(it, 't') }
        subcorpus.s.graph.nonterminals.nt.each { getvals.call(it, 'nt') }
    }
    // Id isn't an attribute in the header.
    attrVal['t'].remove('id')
    attrVal['nt'].remove('id')

    // Remove old feature nodes in master file. The GPath result is a separate
    // list, so the tree can be modified while iterating over it. (The former
    // loop assigned to an undeclared 'node' variable, leaking it into the
    // script binding.)
    master.head.annotation.feature.each { oldFeature ->
        oldFeature.parent().remove(oldFeature)
    }
    assert (! master.head.annotation.feature)

    // Check firstFeat was relevant
    ['t', 'nt'].each { type ->
        if (! (attrVal[type].keySet().contains(firstFeat[type]))) {
            firstFeat[type] = attrVal[type].keySet().sort()[0]
        }
    }
    assert attrVal['t'].keySet().contains(firstFeat['t'])
    assert attrVal['nt'].keySet().contains(firstFeat['nt'])

    // Feature order: the first feature, then all the others.
    def featList = [:]
    ['t', 'nt'].each { type ->
        featList[type] = [firstFeat[type]]
        featList[type].addAll(attrVal[type].keySet().findAll { it != firstFeat[type] })
    }

    // The deep traversal of the header is loop-invariant: do it once instead
    // of once per generated value.
    def headerFeatures = header.'**'.feature

    // Add new feature and value nodes
    ['t', 'nt'].each { type ->
        featList[type].each { feat ->
            def fNode = new Node(master.head.annotation[0], 'feature',
                ['domain':type.toUpperCase(), 'name':feat]
            )
            // Add value nodes if the feature is listed in 'feats'
            if (feats[type]?.contains(feat)) {
                // Sanity checks on the header, needed only when values are generated.
                assert headerFeatures
                assert headerFeatures[0].'@name'
                assert headerFeatures[0].'@domain'
                assert ['NT', 'T'].contains(headerFeatures[0].'@domain')
                def hFNode = headerFeatures.find {
                    it.'@name' == feat && (
                        it.'@domain' == type.toUpperCase() || it.'@domain' == 'FREC'
                    )
                }
                attrVal[type][feat].each { value ->
                    // Description from the header when available.
                    def hValue = hFNode ? hFNode.value.find { it.'@name' == value } : null
                    def vText = hValue ? hValue.text() : '[unknown]'
                    new Node(fNode, 'value', ['name':value], vText)
                }
            }
        }
    }

    // Save to output_file. XmlUtil.serialize() declares encoding="UTF-8" in
    // the XML prolog, so the bytes must be written as UTF-8 and not in the
    // platform default charset.
    outputFile.withWriter('UTF-8') { writer ->
        writer << groovy.xml.XmlUtil.serialize(master)
    }
}
|
138 |
|
tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/SrcmfImporter.groovy (revision 911) | ||
---|---|---|
1 |
/* |
|
2 |
* Calls all import scripts |
|
3 |
*/ |
|
4 |
|
|
5 |
/** |
|
6 |
* |
|
7 |
* @author tmr |
|
8 |
*/ |
|
9 |
|
|
10 |
// Command line form of import statements: |
|
11 |
// import MasterReheader |
|
12 |
// import PunctInjectImport |
|
13 |
// import SubcorpusDataInject |
|
14 |
// import TigerSubcorpus2Main |
|
15 |
// import java.util.logging.FileHandler |
|
16 |
// import javax.xml.parsers.DocumentBuilderFactory |
|
17 |
|
|
18 |
// TXM package statement |
|
19 |
package org.txm.importer.srcmf |
|
20 |
|
|
21 |
import java.util.logging.*; |
|
22 |
|
|
23 |
|
|
24 |
// Command line entry point |
|
25 |
// Command-line entry point: runs the whole SRCMF import chain (everything
// but punctuation, then punctuation injection) on the three given files.
def cli = new CliBuilder(
    usage:'SrcmfImport.groovy [options] tiger_master.xml xml_txm.xml header_file.xml'
)
cli.h(longOpt:'help', 'Prints this message.')
options = cli.parse(args)
// cli.parse() returns null (after printing the usage message itself) when the
// command line cannot be parsed; dereferencing it would raise an NPE.
if (options == null) {
    System.exit(2)
}
// The -h option was declared but never honoured.
if (options.h) {
    cli.usage()
    System.exit(0)
}
if (options.arguments().size() != 3) {
    println 'Incorrect number of command line arguments... exiting'
    // usage() prints by itself and returns nothing: wrapping it in println
    // only appended a stray "null" line to the output.
    cli.usage()
    System.exit(2)
}

def tigerFile = new File(options.arguments()[0])
def txmFile = new File(options.arguments()[1])
def headerFile = new File(options.arguments()[2])
// The binary directory is taken to be the grand-parent of the XML-TXM file.
def tigerXmlAll = doAllButPnc(
    tigerFile,
    txmFile,
    headerFile,
    txmFile.getAbsoluteFile().getParentFile().getParentFile()
)
doPnc(tigerXmlAll, txmFile)
// The intermediate merged file is no longer needed once punctuation is injected.
tigerXmlAll.delete()
|
47 |
|
|
48 |
/**
 * Runs every import step except punctuation injection: PoS injection into the
 * subcorpus files, header rebuilding, then merging of master and subcorpus
 * files.
 *
 * @param tigerFile  the TIGER master file
 * @param txmFile    the XML-TXM file providing the word annotations
 * @param headerFile the header file describing feature values
 * @param binDir     directory receiving the "tiger" sub-directory and "tiger.log"
 * @return the merged file "TigerAll.xml" created in binDir/tiger
 */
def doAllButPnc(File tigerFile, File txmFile, File headerFile, File binDir) {
    // Run pos injection script
    File tigerDir = new File(binDir, "tiger")
    tigerDir.mkdir()
    File masterpos = new File(tigerDir, "master_pos.xml")
    File logFile = new File(binDir, "tiger.log")
    // Use the file the caller passed in. The previous code re-listed the
    // parent directory and took element 0, whose order is unspecified and
    // may not be txmFile when the directory holds several files.
    def logHandler = new FileHandler(logFile.getAbsolutePath())
    try {
        def sdi = new SubcorpusDataInject(txmFile, logHandler, "vers")
        sdi.processMaster(tigerFile, masterpos)
    } finally {
        // The handler holds an open file and a lock file, and is attached to
        // a static logger: detach and close it so repeated imports do not
        // pile up handlers or leak file handles.
        SubcorpusDataInject.logger.removeHandler(logHandler)
        logHandler.close()
    }
    // Run reheader script
    def reheader = new MasterReheader()
    File tmp = File.createTempFile("tmp", ".xml",tigerDir)
    def feats = ['nt':['cat', 'type', 'coord'], 't':['pos', 'form', 'q']]
    def firstFeat = ['nt':'cat', 't':'word']
    reheader.script(masterpos, headerFile, tmp, feats, firstFeat)
    // createTempFile() has already created the file, so its mere existence
    // proves nothing: an empty file is what reveals a failed reheader.
    if (!tmp.exists() || tmp.length() == 0) {
        println "Error: reheader failed"
    }
    masterpos.delete()
    if (!tmp.renameTo(masterpos)) {
        println "Error: could not rename ${tmp.getName()} to ${masterpos.getName()}"
    }
    // Run merge master & subcorpus script
    def tigerXmlAll = new File(masterpos.getParentFile(), "TigerAll.xml")
    def mergescript = new TigerSubcorpus2Main()
    mergescript.script(masterpos, tigerXmlAll)
    return tigerXmlAll
}
|
78 |
|
|
79 |
/**
 * Injects punctuation into the merged TIGER file and writes the result to
 * "TigerPnc.xml" in the same directory.
 *
 * @param tigerXmlAll the merged TIGER XML file
 * @param txmFile     the XML-TXM file handed to the injector
 */
def doPnc(File tigerXmlAll, File txmFile) {
    // Declared locally: an undeclared assignment would create a variable in
    // the script binding as a side effect.
    def injector = new PunctInjectImport(tigerXmlAll, txmFile)
    injector.outputFile = new File(tigerXmlAll.getParentFile(), "TigerPnc.xml")
    injector.process()
}
tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/SubcorpusDataInject.groovy (revision 911) | ||
---|---|---|
1 |
#! /usr/bin/groovy |
|
2 |
package org.txm.importer.srcmf; |
|
3 |
/* |
|
4 |
* To change this template, choose Tools | Templates |
|
5 |
* and open the template in the editor. |
|
6 |
*/ |
|
7 |
|
|
8 |
import java.util.logging.Logger |
|
9 |
import java.util.logging.FileHandler |
|
10 |
import java.util.logging.SimpleFormatter |
|
11 |
import java.util.logging.Level |
|
12 |
|
|
13 |
|
|
14 |
class SubcorpusDataInject { |
|
15 |
// Globals |
|
16 |
def textForm = 'prose' |
|
17 |
def TEI = null |
|
18 |
def wNodes = [] |
|
19 |
def wNodesInQ = [] |
|
20 |
// Set up namespaces |
|
21 |
static nsXml = new groovy.xml.Namespace('http://www.w3.org/XML/1998/namespace', 'xml') |
|
22 |
static nsTxm = new groovy.xml.Namespace('http://textometrie.org/1.0', 'txm') |
|
23 |
// Set up a logger |
|
24 |
static logger = Logger.getLogger('subcorpusDataInjectLog') |
|
25 |
// END GLOBALS |
|
26 |
|
|
27 |
/**
 * Loads the XML-TXM file and prepares the word node lists used by the
 * injection methods.
 *
 * @param xmlTxmFile  the XML-TXM (TEI) file to parse
 * @param fh          handler attached to the class logger; it is given a SimpleFormatter
 * @param newTextForm default text form ('prose', 'vers', 'mixte'); ignored when
 *                    empty, and overridden by the TEI header when it declares one
 */
SubcorpusDataInject (
    File xmlTxmFile,
    FileHandler fh,
    String newTextForm
) {
    // Reset global values
    if (newTextForm) {
        textForm = newTextForm
    }
    // Set up logger (format the handler before any record can reach it)
    fh.setFormatter(new SimpleFormatter())
    logger.addHandler(fh)
    logger.setLevel(Level.ALL)
    // Parse the XML-TXM file
    logger.log(Level.INFO, "Parsing XML-TXM file (may take a while).")
    TEI = new XmlParser().parse(xmlTxmFile)
    // Get text form from TEI header (if expressed)
    logger.log(Level.INFO, "Reading TEI header to detect text form.")
    def catRefs = TEI.teiHeader.profileDesc.textClass.catRef
    if (catRefs) {
        // The target is a '#id' reference: drop the leading character.
        def target = catRefs.'@target'[0]
        def catId = target ? target[1..-1] : null
        def category = TEI.teiHeader.encodingDesc.'**'.category.find {
            it.'@id' == catId || it.attribute(nsXml.id) == catId
        }
        def catDesc = category ? category.catDesc[0] : null
        if (catDesc) {
            textForm = catDesc.text()
            logger.log(Level.INFO, "Found text form '$textForm' in TEI header.")
        } else {
            // A dangling or incomplete reference used to raise an NPE here;
            // fall back on the default form instead.
            logger.log(
                Level.WARNING,
                "Text form category '$catId' not described in TEI header, assuming '$textForm'."
            )
        }
    } else {
        logger.log(Level.INFO, "No text form in TEI header, assuming '$textForm'.")
    }
    // Convert Lbs to Property
    if (textForm == 'vers' || textForm == 'mixte') {
        logger.log(Level.INFO, "Converting <lb/> tags to a word property for later processing.")
        // Work list for a document-order traversal: containers are replaced
        // by their child nodes at the front of the list.
        def undone = TEI.text.body
        def lastLb = false
        def lastWord = null
        while (undone) {
            def aNode = undone.remove(0)
            def localName = aNode.name().getLocalPart()
            if (localName == 'lb') {
                lastLb = true
                // The last word seen before a line break ends a line.
                if (lastWord) {
                    lastWord.'@lb' = 'end'
                }
            } else if (localName == 'w') {
                // Punctuation words neither start nor end a line.
                if (!punctTest(aNode)) {
                    if (lastLb) {
                        aNode.'@lb' = 'start'
                    }
                    lastWord = aNode
                    lastLb = false
                }
            } else {
                undone.addAll(0, aNode.children().findAll { it.getClass() == Node })
            }
        }
    }
    // Build node lists
    logger.log(Level.INFO, "Building node lists.")
    wNodes = TEI.text.'**'.findAll { it.name().getLocalPart() == 'w' }
    wNodesInQ = TEI.text.'**'.q.'**'.findAll { it.name().getLocalPart() == 'w' }
}
|
86 |
|
|
87 |
/**
 * Command-line entry point.
 *
 * Expects two positional arguments, the XML-TXM file and a TIGERSearch file,
 * which is treated as a master file with -m and as a single subcorpus file
 * otherwise.
 */
static main(def args) {
    // Defaults used when -o / -l are not given
    def outputFile = 'test-out.xml'
    def logName = 'subcorpusDataInjectLog.log'
    def cli = new CliBuilder(
        usage:'SubcorpusDataInject.groovy [options] TXM_file.xml TS_file.xml'
    )
    cli.h(longOpt:'help', 'Prints this message')
    cli.m(longOpt:'master', 'Specifies input file is a TS master file.')
    cli.v(longOpt:'verse', 'Sets default text form to verse. Overridden by TEI header.')
    cli.l(longOpt:'logfile', args:1, argName:'logfile.log', 'Use given file for log.')
    cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file')
    println args
    def options = cli.parse(args)
    // cli.parse() returns null (usage already printed) when the command line
    // cannot be parsed.
    if (options == null) {
        return
    }
    // The -h option was declared but never honoured.
    if (options.h) {
        cli.usage()
        return
    }
    if (options.arguments().size() != 2) {
        println "Wrong number of arguments."
        // usage() prints by itself; println around it only added a "null" line.
        cli.usage()
        return
    }
    // An absent option reads as false, hence the elvis defaults.
    def logHandler = new FileHandler(options.logfile ?: logName)
    try {
        // Call XML TXM loader
        def sdi = new SubcorpusDataInject(
            new File (options.arguments()[0]),
            logHandler,
            options.v ? 'vers' : ''
        )
        // Call process master or process subcorpus
        def tsFile = new File(options.arguments()[1]).getCanonicalFile()
        def resultFile = new File (options.output ?: outputFile)
        if (options.m) {
            sdi.processMaster(tsFile, resultFile)
        } else {
            sdi.processSubcorpus(tsFile, resultFile)
        }
    } finally {
        // Flush the log and release the log file.
        logHandler.close()
    }
}
|
132 |
|
|
133 |
/**
 * Runs the transform on each subcorpus listed in a master file, then saves a
 * copy of the master file that references the transformed subcorpus files.
 *
 * For a subcorpus named X.xml next to the master file, X_pos.xml is written
 * next to the output file.
 *
 * @param masterFile the TIGERSearch master file
 * @param outputFile the new master file to write (UTF-8)
 */
def processMaster (File masterFile, File outputFile) {
    logger.log(Level.INFO, "Loading corpus master file")
    // Resolve the folders from absolute paths. A bare file name has no parent,
    // and the empty-string parent used previously makes java.io.File resolve
    // the child against a system-dependent default directory (the root on
    // Unix-like systems) rather than the current working directory.
    File masterFolder = masterFile.getAbsoluteFile().getParentFile()
    File outputFolder = outputFile.getAbsoluteFile().getParentFile()
    def master = new XmlParser().parse(masterFile)
    // Process each subcorpus file
    master.body.subcorpus.each {
        // 'external' carries a 5-character prefix followed by the file name;
        // [5..-5] also drops the 4-character extension.
        String external = it.'@external'
        processSubcorpus(
            new File (masterFolder, external[5..-1]),
            new File (outputFolder, "${external[5..-5]}_pos.xml".toString())
        )
        it.'@external' = "${external[0..-5]}_pos.xml".toString()
    }
    logger.log(Level.INFO, "Saving new master file ${outputFile.getName()}")

    // XmlUtil.serialize() declares encoding="UTF-8" in the XML prolog, so the
    // file must be written as UTF-8 and not in the platform default charset.
    outputFile.withWriter('UTF-8') { writer ->
        writer << groovy.xml.XmlUtil.serialize(master)
    }
    logger.log(Level.INFO, "Run reheader on this file!")
}
|
163 |
|
|
164 |
/**
 * Injects the 'pos', 'form' and 'q' attributes into every terminal (t) node
 * of a TIGERSearch subcorpus file, using the <w/> nodes of the XML-TXM file
 * loaded by the constructor, then recomputes the 'headpos' attribute of every
 * non-terminal (nt) node and saves the result.
 *
 * Terminals and words are walked in parallel; words are paired through
 * idmatch(). Placeholder terminals ('#') receive '--' values, duplicata
 * terminals ('*') borrow the annotations of the word they duplicate.
 *
 * @param subcorpusFile the TIGERSearch subcorpus file to read
 * @param outputFile    the file receiving the annotated subcorpus (UTF-8)
 */
def processSubcorpus (File subcorpusFile, File outputFile) {
    // Parse Subcorpus file
    logger.log(Level.INFO, "Loading subcorpus file ${subcorpusFile.getName()}")
    def TS = new XmlParser().parse(subcorpusFile)
    // This is used as a stack
    def tNodes = TS.s.graph.terminals.t
    // This isn't
    def tNodesFixed = TS.s.graph.terminals.t

    // Build duplicata correspondance list: pairs [duplicata id, real id]
    logger.log(Level.INFO, "Building duplicata correspondance list")
    def duplIdMatches = tNodes.findAll {it.'@word' == '*'}.collect { dupl ->
        def duplId = dupl.'@id'
        def realId = dupl.parent().parent().nonterminals.nt.edge.find {
            it.'@idref' == duplId
        }.parent().secedge.find {
            it.'@label' == 'dupl'
        }.'@idref'
        [duplId, realId]
    }
    logger.log(Level.INFO, "${duplIdMatches.size()} duplicatas found.")

    // Add pos & form tags
    logger.log(Level.INFO, "Adding PoS and form tags to each t-node")
    // Get first wNode corresponding to a tNode.
    // The index is tested explicitly against -1: with Groovy truth, index 0
    // (a match on the very first word) counted as "not found" and -1 (no
    // match at all) counted as "found".
    def i = 0
    def wIx = -1
    while (i < tNodes.size()) {
        // Avoid trying to find "fake" first words, it's very inefficient
        if (tNodes[i].'@word' != '#') {
            wIx = wNodes.findIndexOf { idmatch(tNodes[i], it) }
        }
        if (wIx >= 0) {
            break
        }
        i++
    }
    assert wIx >= 0, "$i"
    /*
     * MAIN ITERATOR BEGINS HERE FOR POS INJECTION
     */
    // Iterate over tNodes.
    while (tNodes) {
        def thisTNode = tNodes.remove(0)
        def thisTNodeId = thisTNode.'@editionId'
        def wNode = null
        def q = false
        // Word currently paired with the terminal; null once the word list is
        // exhausted, so that trailing terminals are reported and not crashing.
        def currentW = (wIx < wNodes.size()) ? wNodes[wIx] : null
        // Define the PoS / q / form injecting closure (reads wNode and q)
        def injectPos = {
            // Q tag
            thisTNode.'@q' = q ? 'y' : 'n'
            // Form tag
            if (textForm == 'prose') {
                thisTNode.'@form' = 'prose'
            } else if (
                textForm == 'vers' ||
                (
                    textForm == 'mixte'
                    && wNode.parent().parent().name().getLocalPart() == 'ab'
                )
            ) {
                if (! wNode.'@lb') {
                    thisTNode.'@form' = 'vers'
                } else if (wNode.'@lb' == 'start') {
                    thisTNode.'@form' = 'vers_debut'
                } else if (wNode.'@lb' == 'end') {
                    thisTNode.'@form' = 'vers_fin'
                }
            } else {
                // Any other case (e.g. 'mixte' outside an <ab>) is prose.
                thisTNode.'@form' = 'prose'
            }
            // Find PoS tag in BFM file
            // Try txm:ana tags, then interp tags
            def txmAnaList = wNode[nsTxm.ana]
            if (! txmAnaList) {
                txmAnaList = wNode.interp
            }
            def anaOfType = { String type -> txmAnaList.find { it.'@type' == type } }
            if (anaOfType('#pos_syn')) {
                // type="#pos_syn"
                thisTNode.'@pos' = anaOfType('#pos_syn').text()
            } else if (! txmAnaList && wNode.'@type') {
                // type attr of wNode
                thisTNode.'@pos' = wNode.'@type'
            } else if (anaOfType('#pos')) {
                // type="#pos" (a comparison: the former '=' overwrote the
                // type of the first annotation and always matched)
                thisTNode.'@pos' = anaOfType('#pos').text()
            } else if (anaOfType('#fropos')) {
                // type="#fropos"
                thisTNode.'@pos' = anaOfType('#fropos').text()
            } else {
                thisTNode.'@pos' = '--'
            }
        }
        // End of injecting closure.
        // Check whether wNode matches tNode
        if (thisTNode.'@word' == '#') {
            // Not expecting to find these in BFM file, all OK.
            thisTNode.'@pos' = '--'
            thisTNode.'@form' = '--'
            thisTNode.'@q' = '--'
        } else if (currentW != null && idmatch(thisTNode, currentW)) {
            // Perfect match
            wNode = currentW
            q = wNodesInQ.contains(wNode)
            injectPos.call()
            wIx++
        } else if (thisTNode.'@word' == '*') {
            // Duplicata, use dupl match list to find correct wNode
            def duplIdMatch = duplIdMatches.find { it[0] == thisTNode.'@id' }
            if (duplIdMatch) {
                def matchTNode = tNodesFixed.find { it.'@id' == duplIdMatch[1] }
                // To enhance performance, I assume initially that the wNode matching the
                // duplicata is within +- 10 words of the current position in the
                // list. The wNodes list is very long, and searches over the entire
                // list are avoided.
                assert matchTNode, "$duplIdMatch"
                if (wIx + 10 < wNodes.size() && wIx - 10 > 0) {
                    wNode = wNodes[wIx-10 .. wIx+10].find {
                        idmatch(matchTNode, it)
                    }
                }
                // Use whole list if it failed.
                if (! wNode) {
                    wNode = wNodes.find {
                        idmatch(matchTNode, it)
                    }
                }
                q = wNodesInQ.contains(wNode)
                injectPos.call()
                // Don't update wIx value!
            } else {
                thisTNode.'@pos' = '--'
                thisTNode.'@form' = '--'
                thisTNode.'@q' = '--'
                logger.log(
                    Level.WARNING,
                    "<w/> node matching TS duplicata $thisTNodeId not found."
                )
            }
        } else if (currentW != null && punctTest(currentW)) {
            // not expecting to find these in TS file; try same tNode again.
            tNodes.add(0, thisTNode)
            wIx++
        } else {
            // Single search over the whole word list (it used to be run twice).
            def foundW = wNodes.find {
                idmatch (thisTNode, it)
            }
            if (! foundW) {
                // Check the SRCMF corpus hasn't acquired an extra word
                logger.log(
                    Level.WARNING,
                    "TS word $thisTNodeId (${thisTNode.'@word'}) is " +
                    "absent from BFM file."
                )
                thisTNode.'@pos' = '--'
                thisTNode.'@form' = '--'
                thisTNode.'@q' = '--'
            } else {
                // It doesn't match, it's not a duplicata, it's not BFM
                // punctuation, but it IS found somewhere in the BFM corpus:
                // resynchronise on the word found.
                logger.log(
                    Level.WARNING,
                    "Discontinuity in TS corpus before word $thisTNodeId " +
                    "(${thisTNode.'@word'})."
                )
                wNode = foundW
                q = wNodesInQ.contains(wNode)
                injectPos.call()
                wIx = wNodes.indexOf(wNode) + 1
            }
        }
    }
    /*
     * End Main iteration
     */
    /*******************
     * Recalculate headpos
     */
    logger.log(Level.INFO, "Recalculating headpos.")
    TS.s.graph.nonterminals.nt.each { nt ->
        def a = nt.edge.findAll { it.'@label' == 'L' }
        if (a.size() == 0) {
            nt.'@headpos' = '--'
        } else if (a.size() == 1) {
            nt.'@headpos' = nt.parent().parent().terminals.t.find {
                it.'@id' == a[0].'@idref'
            }.'@pos'
        } else {
            // Several heads: collect their PoS tags (collect, not each: each
            // returns the edge list itself, so the test below was never true)
            // and keep the first category of the priority list among them.
            def terminals = nt.parent().parent().terminals.t
            def headTags = a.collect { edge ->
                terminals.find { it.'@id' == edge.'@idref' }?.'@pos'
            }
            // NOTE(review): tags are compared for equality with the category
            // names; confirm whether prefix matching was intended.
            def headpos = ['NOM', 'PRO', 'ADJ', 'VER', 'ADV', 'CON', 'PRE', 'DET'].find {
                headTags.contains(it)
            }
            if (headpos) {
                nt.'@headpos' = headpos + '?'
            }
        }
        assert nt.'@headpos'
    }
    logger.log(Level.INFO, "Saving new subcorpus file ${outputFile.getName()}")

    // XmlUtil.serialize() declares encoding="UTF-8" in the XML prolog, so the
    // file must be written as UTF-8 and not in the platform default charset.
    outputFile.withWriter('UTF-8') { writer ->
        writer << groovy.xml.XmlUtil.serialize(TS)
    }
}
|
384 |
// Subroutine self-test to establish whether a Groovy node is punctuation |
|
385 |
/**
 * Tells whether a word node is punctuation.
 *
 * A non-empty 'type' attribute decides alone (it must start with 'pon',
 * ignoring case). Otherwise the node is punctuation when the text of one of
 * its txm:ana children, or failing those of its interp children, starts
 * with 'pon'.
 */
private punctTest(gNode) {
    def typeAttr = gNode.'@type'
    if (typeAttr) {
        return typeAttr.toLowerCase().startsWith('pon')
    }
    // txm:ana annotations take precedence; interp is only read without them.
    def annotations = gNode[nsTxm.ana] ?: gNode.interp
    def punctAnnotation = annotations.find { ana ->
        ana.text().toLowerCase().startsWith('pon')
    }
    return punctAnnotation != null
}
|
396 |
|
|
397 |
private idmatch (tNode, wNode) { |
|
398 |
// Matches a TS node with a <w/> node |
|
399 |
def idMatch = false |
|
400 |
def wNodeId = wNode.'@id' |
|
401 |
if (! wNodeId) { |
|
402 |
wNodeId = wNode.attribute(nsXml.id) |
|
403 |
} |
|
404 |
def tNodeId = tNode.'@editionId' |
|
405 |
def regex = /(\d+)([_a-z]+)?$/ |
|
406 |
def m1 = (tNodeId =~ regex) |
|
407 |
def m2 = (wNodeId =~ regex) |
|
408 |
if ( m1.find() && m2.find() ) { |
|
409 |
idMatch = (m1.group(1).toInteger() == m2.group(1).toInteger()) |
|
410 |
} else { |
|
411 |
logger.log( |
|
412 |
Level.WARNING, |
|
413 |
"Idmatch routine can't find integer part of one or both of the" + |
|
414 |
"paired Ids for nodes $tNode, $wNode" |
|
415 |
) |
|
416 |
} |
Formats disponibles : Unified diff