Révision 911
| tmp/org.txm.para.rcp/.classpath (revision 911) | ||
|---|---|---|
| 1 | 1 |
<?xml version="1.0" encoding="UTF-8"?> |
| 2 | 2 |
<classpath> |
| 3 | 3 |
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/> |
| 4 |
<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/> |
|
| 4 |
<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"> |
|
| 5 |
<accessrules> |
|
| 6 |
<accessrule kind="accessible" pattern="**"/> |
|
| 7 |
</accessrules> |
|
| 8 |
</classpathentry> |
|
| 5 | 9 |
<classpathentry kind="src" path="src"/> |
| 6 | 10 |
<classpathentry kind="output" path="bin"/> |
| 7 | 11 |
</classpath> |
| tmp/org.txm.para.rcp/META-INF/MANIFEST.MF (revision 911) | ||
|---|---|---|
| 1 | 1 |
Manifest-Version: 1.0 |
| 2 |
Require-Bundle: org.txm.utils;bundle-version="1.0.0";visibility:=reexp |
|
| 3 |
ort,org.eclipse.core.runtime;bundle-version="3.10.0";visibility:=reex |
|
| 4 |
port,org.eclipse.osgi;bundle-version="3.10.2";visibility:=reexport,or |
|
| 5 |
g.eclipse.ui;visibility:=reexport,org.txm.concordance.rcp;visibility: |
|
| 6 |
=reexport,org.txm.searchengine.cqp.core;bundle-version="1.1.0";visibi |
|
| 7 |
lity:=reexport,org.txm.rcp;bundle-version="0.7.8";visibility:=reexpor |
|
| 8 |
t,org.txm.core;bundle-version="0.7.0";visibility:=reexport,org.txm.pa |
|
| 9 |
ra.core;visibility:=reexport,org.txm.searchengine.core;bundle-version |
|
| 10 |
="1.0.0";visibility:=reexport |
|
| 2 |
Require-Bundle: org.txm.concordance.rcp;visibility:=reexport, |
|
| 3 |
org.txm.para.core;visibility:=reexport |
|
| 11 | 4 |
Export-Package: org.txm.para.rcp.editors, |
| 12 | 5 |
org.txm.para.rcp.handlers, |
| 13 | 6 |
org.txm.para.rcp.messages |
| tmp/org.txm.textsbalance.feature/feature.xml (revision 911) | ||
|---|---|---|
| 65 | 65 |
</license> |
| 66 | 66 |
|
| 67 | 67 |
<requires> |
| 68 |
<import plugin="org.txm.searchengine.cqp.core" version="1.1.0" match="greaterOrEqual"/> |
|
| 69 |
<import plugin="org.txm.searchengine.core" version="1.0.0" match="greaterOrEqual"/> |
|
| 70 |
<import plugin="org.txm.rcp" version="0.7.7" match="greaterOrEqual"/> |
|
| 71 |
<import plugin="org.eclipse.core.runtime" version="3.10.0" match="greaterOrEqual"/> |
|
| 72 |
<import plugin="org.eclipse.ui" version="3.106.1" match="greaterOrEqual"/> |
|
| 73 |
<import plugin="org.eclipse.jface.text"/> |
|
| 74 |
<import plugin="org.eclipse.ui.editors"/> |
|
| 75 |
<import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/> |
|
| 76 |
<import plugin="org.txm.chartsengine.r.core"/> |
|
| 77 |
<import plugin="org.eclipse.core.expressions" version="3.4.600" match="greaterOrEqual"/> |
|
| 78 | 68 |
<import plugin="org.txm.textsbalance.core" version="1.0.0" match="greaterOrEqual"/> |
| 79 |
<import plugin="org.txm.chartsengine.jfreechart.core" version="1.0.0" match="greaterOrEqual"/> |
|
| 80 | 69 |
<import plugin="org.txm.chartsengine.rcp" version="1.0.0" match="greaterOrEqual"/> |
| 70 |
<import plugin="org.txm.searchengine.cqp.core" version="1.1.0" match="greaterOrEqual"/> |
|
| 71 |
<import plugin="org.txm.chartsengine.jfreechart.core" version="1.0.0" match="greaterOrEqual"/> |
|
| 72 |
<import plugin="org.txm.chartsengine.r.core" version="1.0.0" match="greaterOrEqual"/> |
|
| 81 | 73 |
</requires> |
| 82 | 74 |
|
| 83 | 75 |
<plugin |
| tmp/org.txm.para.core/.classpath (revision 911) | ||
|---|---|---|
| 1 | 1 |
<?xml version="1.0" encoding="UTF-8"?> |
| 2 | 2 |
<classpath> |
| 3 | 3 |
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/> |
| 4 |
<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/> |
|
| 4 |
<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"> |
|
| 5 |
<accessrules> |
|
| 6 |
<accessrule kind="accessible" pattern="**"/> |
|
| 7 |
</accessrules> |
|
| 8 |
</classpathentry> |
|
| 5 | 9 |
<classpathentry kind="src" path="src"/> |
| 6 | 10 |
<classpathentry kind="output" path="bin"/> |
| 7 | 11 |
</classpath> |
| tmp/org.txm.para.core/META-INF/MANIFEST.MF (revision 911) | ||
|---|---|---|
| 1 | 1 |
Manifest-Version: 1.0 |
| 2 |
Require-Bundle: org.txm.libs.groovy-all;bundle-version="2.3.3";visibil |
|
| 3 |
ity:=reexport,org.txm.utils;bundle-version="1.0.0";visibility:=reexpo |
|
| 4 |
rt,org.eclipse.osgi;bundle-version="3.10.2";visibility:=reexport,org. |
|
| 5 |
eclipse.core.runtime;bundle-version="3.10.0";visibility:=reexport,org |
|
| 6 |
.txm.searchengine.cqp.core;bundle-version="1.1.0";visibility:=reexpor |
|
| 7 |
t,org.txm.core;bundle-version="0.7.0";visibility:=reexport,org.eclips |
|
| 8 |
e.ui;visibility:=reexport,org.txm.searchengine.core;bundle-version="1 |
|
| 9 |
.0.0";visibility:=reexport |
|
| 2 |
Require-Bundle: org.txm.searchengine.cqp.core;bundle-version="1.1.0";visibility:=reexport |
|
| 10 | 3 |
Export-Package: org.txm.para.core.functions |
| 11 | 4 |
Bundle-ActivationPolicy: lazy |
| 12 | 5 |
Bundle-Version: 1.0.0.qualifier |
| tmp/org.txm.treetagger.files.feature/feature.xml (revision 911) | ||
|---|---|---|
| 69 | 69 |
</license> |
| 70 | 70 |
|
| 71 | 71 |
<requires> |
| 72 |
<import plugin="org.txm.utils"/> |
|
| 73 |
<import plugin="org.eclipse.core.runtime" version="3.10.0" match="greaterOrEqual"/> |
|
| 74 | 72 |
<import plugin="org.txm.core" version="0.8.0" match="greaterOrEqual"/> |
| 75 | 73 |
<import plugin="org.txm.treetagger.core" version="1.0.0" match="greaterOrEqual"/> |
| 76 |
<import plugin="org.eclipse.ui"/> |
|
| 77 |
<import plugin="org.eclipse.swt"/> |
|
| 78 |
<import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/> |
|
| 79 | 74 |
<import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/> |
| 80 |
<import plugin="org.txm.libs.groovy-all" version="2.3.3" match="greaterOrEqual"/> |
|
| 81 |
<import plugin="org.txm.utils" version="1.0.0" match="greaterOrEqual"/> |
|
| 82 | 75 |
</requires> |
| 83 | 76 |
|
| 84 | 77 |
<plugin |
| tmp/org.txm.analec.rcp/src/org/txm/analec/imports/AnalecAnnotationsImporter.java (revision 911) | ||
|---|---|---|
| 18 | 18 |
import org.apache.commons.lang.StringUtils; |
| 19 | 19 |
import org.eclipse.core.runtime.IProgressMonitor; |
| 20 | 20 |
import org.txm.Toolbox; |
| 21 |
import org.txm.importer.graal.PersonalNamespaceContext;
|
|
| 21 |
import org.txm.importer.PersonalNamespaceContext; |
|
| 22 | 22 |
import org.txm.searchengine.cqp.AbstractCqiClient; |
| 23 | 23 |
import org.txm.searchengine.cqp.CQPSearchEngine; |
| 24 | 24 |
import org.txm.searchengine.cqp.corpus.MainCorpus; |
| tmp/org.txm.analec.rcp/src/org/txm/analec/imports/DOMAnalecAnnotationsImporter.java (revision 911) | ||
|---|---|---|
| 8 | 8 |
|
| 9 | 9 |
import org.eclipse.core.runtime.IProgressMonitor; |
| 10 | 10 |
import org.txm.Toolbox; |
| 11 |
import org.txm.importer.graal.PersonalNamespaceContext;
|
|
| 11 |
import org.txm.importer.PersonalNamespaceContext; |
|
| 12 | 12 |
import org.txm.searchengine.cqp.AbstractCqiClient; |
| 13 | 13 |
import org.txm.searchengine.cqp.CQPSearchEngine; |
| 14 | 14 |
import org.txm.searchengine.cqp.corpus.MainCorpus; |
| tmp/org.txm.analec.rcp/src/org/txm/analec/export/AnalecAnnotationTEIExporter.java (revision 911) | ||
|---|---|---|
| 17 | 17 |
import org.txm.Toolbox; |
| 18 | 18 |
import org.txm.importer.StaxIdentityParser; |
| 19 | 19 |
import org.txm.importer.StaxStackWriter; |
| 20 |
import org.txm.importer.graal.PersonalNamespaceContext;
|
|
| 20 |
import org.txm.importer.PersonalNamespaceContext; |
|
| 21 | 21 |
import org.txm.objects.BaseParameters; |
| 22 | 22 |
import org.txm.rcp.Application; |
| 23 | 23 |
import org.txm.rcp.TxmPreferences; |
| tmp/org.txm.wordcloud.feature/feature.xml (revision 911) | ||
|---|---|---|
| 67 | 67 |
</license> |
| 68 | 68 |
|
| 69 | 69 |
<requires> |
| 70 |
<import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/> |
|
| 71 |
|
|
| 72 |
<import plugin="org.txm.utils" version="1.0.0" match="greaterOrEqual"/> |
|
| 73 |
<import plugin="org.eclipse.core.runtime" version="3.10.0" match="greaterOrEqual"/> |
|
| 74 |
<import plugin="org.txm.chartsengine.core" version="1.0.0" match="greaterOrEqual"/> |
|
| 75 |
<import plugin="org.txm.searchengine.cqp.core" version="1.1.0" match="greaterOrEqual"/> |
|
| 76 |
<import plugin="org.txm.statsengine.r.core" version="1.0.0" match="greaterOrEqual"/> |
|
| 77 | 70 |
<import plugin="org.txm.chartsengine.jfreechart.core" version="1.0.0" match="greaterOrEqual"/> |
| 78 |
<import plugin="org.txm.statsengine.core" version="1.0.0" match="greaterOrEqual"/> |
|
| 79 | 71 |
<import plugin="org.txm.chartsengine.r.core" version="1.0.0" match="greaterOrEqual"/> |
| 80 | 72 |
<import plugin="org.txm.index.core" version="1.0.0" match="greaterOrEqual"/> |
| 81 |
<import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/> |
|
| 82 |
<import plugin="org.eclipse.osgi" version="3.10.2" match="greaterOrEqual"/> |
|
| 83 | 73 |
<import plugin="org.txm.index.rcp" version="1.0.0" match="greaterOrEqual"/> |
| 84 |
<import plugin="org.eclipse.ui" version="3.106.1" match="greaterOrEqual"/> |
|
| 85 | 74 |
<import plugin="org.txm.wordcloud.core" version="1.0.0" match="greaterOrEqual"/> |
| 86 | 75 |
<import plugin="org.txm.chartsengine.rcp"/> |
| 87 | 76 |
</requires> |
| tmp/org.txm.dictionary.feature/feature.xml (revision 911) | ||
|---|---|---|
| 17 | 17 |
</license> |
| 18 | 18 |
|
| 19 | 19 |
<requires> |
| 20 |
<import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/> |
|
| 21 |
<import plugin="org.txm.libs.groovy-all" version="2.3.3" match="greaterOrEqual"/> |
|
| 22 |
<import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/> |
|
| 23 |
<import plugin="org.eclipse.ui"/> |
|
| 24 |
<import plugin="org.eclipse.core.runtime"/> |
|
| 25 |
<import plugin="org.txm.utils"/> |
|
| 26 |
<import plugin="org.eclipse.persistence.jpa" version="2.6.0" match="greaterOrEqual"/> |
|
| 27 |
<import plugin="org.txm.index.core" version="1.0.0" match="greaterOrEqual"/> |
|
| 28 | 20 |
<import plugin="org.txm.index.rcp" version="1.0.0" match="greaterOrEqual"/> |
| 29 |
<import plugin="javax.persistence"/> |
|
| 30 | 21 |
<import plugin="org.txm.annotation.kr.core" version="1.0.0" match="greaterOrEqual"/> |
| 31 | 22 |
</requires> |
| 32 | 23 |
|
| tmp/org.txm.wordcloud.rcp/META-INF/MANIFEST.MF (revision 911) | ||
|---|---|---|
| 1 | 1 |
Manifest-Version: 1.0 |
| 2 |
Require-Bundle: org.txm.rcp;bundle-version="0.7.8";visibility:=reexpor |
|
| 3 |
t,org.txm.utils;bundle-version="1.0.0";visibility:=reexport,org.eclip |
|
| 4 |
se.osgi;bundle-version="3.10.2";visibility:=reexport,org.txm.index.rc |
|
| 5 |
p;bundle-version="1.0.0";visibility:=reexport,org.eclipse.core.runtim |
|
| 6 |
e;bundle-version="3.10.0";visibility:=reexport,org.txm.chartsengine.c |
|
| 7 |
ore;bundle-version="1.0.0";visibility:=reexport,org.txm.index.core;bu |
|
| 8 |
ndle-version="1.0.0";visibility:=reexport,org.txm.searchengine.cqp.co |
|
| 9 |
re;bundle-version="1.1.0";visibility:=reexport,org.eclipse.ui;bundle- |
|
| 10 |
version="3.106.1";visibility:=reexport,org.txm.wordcloud.core;bundle- |
|
| 11 |
version="1.0.0";visibility:=reexport,org.txm.chartsengine.rcp;visibil |
|
| 12 |
ity:=reexport,org.txm.core;bundle-version="0.7.0";visibility:=reexpor |
|
| 13 |
t |
|
| 2 |
Require-Bundle: org.txm.index.rcp;bundle-version="1.0.0";visibility:=reexport, |
|
| 3 |
org.txm.wordcloud.core;bundle-version="1.0.0";visibility:=reexport, |
|
| 4 |
org.txm.chartsengine.rcp;visibility:=reexport |
|
| 14 | 5 |
Bundle-Vendor: Textometrie.org |
| 15 | 6 |
Bundle-ActivationPolicy: lazy |
| 16 | 7 |
Bundle-Version: 1.0.0.qualifier |
| tmp/org.txm.tigersearch.feature/feature.xml (revision 911) | ||
|---|---|---|
| 17 | 17 |
</license> |
| 18 | 18 |
|
| 19 | 19 |
<requires> |
| 20 |
<import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/> |
|
| 21 |
<import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/> |
|
| 22 |
<import plugin="org.eclipse.ui"/> |
|
| 23 |
<import plugin="org.eclipse.core.runtime"/> |
|
| 24 |
<import plugin="org.eclipse.ui.editors" version="3.8.200" match="greaterOrEqual"/> |
|
| 25 |
<import plugin="org.eclipse.swt"/> |
|
| 26 |
<import plugin="org.txm.searchengine.core" version="1.0.0" match="greaterOrEqual"/> |
|
| 27 |
<import plugin="org.txm.searchengine.cqp.core" version="1.1.0" match="greaterOrEqual"/> |
|
| 28 |
<import plugin="org.txm.statsengine.core" version="1.0.0" match="greaterOrEqual"/> |
|
| 29 |
<import plugin="org.txm.statsengine.r.core" version="1.0.0" match="greaterOrEqual"/> |
|
| 30 |
<import plugin="org.txm.statsengine.r.rcp" version="1.0.0" match="greaterOrEqual"/> |
|
| 31 |
<import plugin="org.txm.index.core" version="1.0.0" match="greaterOrEqual"/> |
|
| 32 | 20 |
<import plugin="org.txm.index.rcp" version="1.0.0" match="greaterOrEqual"/> |
| 33 |
<import plugin="org.txm.utils"/> |
|
| 34 |
<import plugin="org.txm.groovy.core" version="1.0.0" match="greaterOrEqual"/> |
|
| 35 | 21 |
</requires> |
| 36 | 22 |
|
| 37 | 23 |
<plugin |
| tmp/org.txm.cql2lsa.feature/feature.xml (revision 911) | ||
|---|---|---|
| 17 | 17 |
</license> |
| 18 | 18 |
|
| 19 | 19 |
<requires> |
| 20 |
<import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/> |
|
| 21 | 20 |
<import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/> |
| 22 |
<import plugin="org.eclipse.ui"/> |
|
| 23 |
<import plugin="org.eclipse.core.runtime"/> |
|
| 24 |
<import plugin="org.eclipse.ui.editors" version="3.8.100" match="greaterOrEqual"/> |
|
| 25 |
<import plugin="org.eclipse.core.expressions" version="3.4.500" match="greaterOrEqual"/> |
|
| 26 |
<import plugin="org.txm.index.core"/> |
|
| 27 |
<import plugin="org.txm.lexicaltable.core"/> |
|
| 28 |
<import plugin="org.txm.statsengine.r.core"/> |
|
| 29 | 21 |
</requires> |
| 30 | 22 |
|
| 31 | 23 |
<plugin |
| tmp/org.txm.tigersearch.rcp/.settings/org.eclipse.jdt.groovy.core.prefs (revision 911) | ||
|---|---|---|
| 1 | 1 |
eclipse.preferences.version=1 |
| 2 |
groovy.compiler.level=23 |
|
| 2 |
groovy.compiler.level=-1 |
|
| 3 |
groovy.script.filters=**/*.dsld,y,**/*.gradle,n |
|
| tmp/org.txm.tigersearch.rcp/META-INF/MANIFEST.MF (revision 911) | ||
|---|---|---|
| 131 | 131 |
ls.debugger,org.mozilla.javascript.tools.idswitch,org.mozilla.javascr |
| 132 | 132 |
ipt.tools.jsc,org.mozilla.javascript.tools.shell,org.relaxng.datatype |
| 133 | 133 |
,org.relaxng.datatype.helpers,org.txm.export.ts,org.txm.function.tige |
| 134 |
rsearch,org.txm.importer.srcmf,org.txm.importer.tigersearch,org.txm.s
|
|
| 134 |
rsearch,org.txm.s |
|
| 135 | 135 |
earchengine.ts,org.txm.test,org.txm.tigersearch.commands,org.txm.tige |
| 136 | 136 |
rsearch.editors,org.w3c.css.sac,org.w3c.css.sac.helpers,org.w3c.dom.s |
| 137 | 137 |
mil,org.w3c.dom.svg,tigerAPI,tigerAPI.converters,tigerAPI.theories.ho |
| tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/PunctInjectImport.groovy (revision 911) | ||
|---|---|---|
| 1 |
// Script to restore punctuation etc. to results of TS query. |
|
| 2 |
// Inputs: |
|
| 3 |
// --- Tiger-XML Document node |
|
| 4 |
// --- Java array: |
|
| 5 |
// --- --- String [index][type] where: |
|
| 6 |
// --- --- --- type == 0 gives the xml:id |
|
| 7 |
// --- --- --- type == 1 gives the word form |
|
| 8 |
// Process: |
|
| 9 |
// --- Injects punctuation. |
|
| 10 |
// Returns: |
|
| 11 |
// --- Tiger-XML Document node. |
|
| 12 |
|
|
| 13 |
// TXM package statement |
|
| 14 |
package org.txm.importer.srcmf |
|
| 15 |
|
|
| 16 |
import javax.xml.parsers.DocumentBuilderFactory |
|
| 17 |
|
|
| 18 |
|
|
| 19 |
|
|
| 20 |
/**
 * Restores the tokens missing from a Tiger-XML document (punctuation etc.)
 * using an ordered reference table of <code>[id, word form]</code> pairs.
 *
 * The reference table is either given directly or read from the
 * <code>tei:w</code> elements of an XML-TXM file. {@link #process()} first
 * inserts the missing words as new <code>t</code> elements, then moves
 * sentence-initial closing punctuation to the end of the previous
 * <code>terminals</code> element, and finally writes the document to
 * {@link #outputFile}.
 *
 * @author tmr
 */
class PunctInjectImport
{
    static nsXml = new groovy.xml.Namespace('http://www.w3.org/XML/1998/namespace', 'xml')
    static nsTei = new groovy.xml.Namespace('http://www.tei-c.org/ns/1.0', 'tei')

    /** Punctuation characters that exempt a reference word from the corpus sanity checks. */
    private static final PUNCTUATION = ~/[\,\.\?\!\:\;\(\)\[\]\{\}]/

    /**
     * Any ASCII letter or digit, or Latin-1 letter. The ranges are spelled out
     * on purpose: "A-z" would also match [ \ ] ^ _ ` and "À-ÿ" would match × and ÷.
     */
    private static final WORD_CHAR = ~/[A-Za-z0-9À-ÖØ-öø-ÿ]/

    /** Root element (org.w3c.dom.Element) of the parsed Tiger-XML document. */
    def tigerXml = null
    /** Ordered reference table; each entry is a two-element list [id, word form]. */
    def txmIdWordTableFixed = []
    /** File written by process(); defaults to "output.xml" in the working directory. */
    def outputFile = null

    /*
     * Call this constructor when no need of XML-TXM file for word ID table
     */
    PunctInjectImport(File tigerXmlFile, ArrayList txmIdWordTable) {
        tigerXml = parseTigerXml(tigerXmlFile)
        txmIdWordTableFixed = txmIdWordTable
        outputFile = new File("output.xml")
    }

    /*
     * Call this constructor when the XML-TXM file is needed for the word list.
     */
    PunctInjectImport(File tigerXmlFile, File xmlTxmFile) {
        tigerXml = parseTigerXml(tigerXmlFile)
        txmIdWordTableFixed = []
        def xmlTxmGroovyDOM = new XmlParser().parse(xmlTxmFile)
        for (def w : xmlTxmGroovyDOM.text.'**'.findAll { it.name() == nsTei.'w' } ) {
            // The id is looked up first without namespace, then as xml:id.
            def anId = w.'@id'
            if (! anId) {
                anId = w.attribute(nsXml.id)
            }
            if (! anId) {
                // Best effort: the word is still recorded, with a null id.
                println "Error: ID attribute not located in XML TXM file"
            }
            txmIdWordTableFixed << [anId, getDeepText(w."txm:form"[0])]
        }
        outputFile = new File("output.xml")
    }

    /** Parses the Tiger-XML file and returns its document element. */
    private static parseTigerXml(File tigerXmlFile) {
        def builder = DocumentBuilderFactory.newInstance().newDocumentBuilder()
        return builder.parse(tigerXmlFile).documentElement
    }

    /**
     * Injects the missing words, relocates sentence-initial punctuation and
     * writes the resulting document to {@link #outputFile} (UTF-8).
     *
     * @throws IllegalStateException if the Tiger-XML document has no t element
     */
    def process () {
        def allTs = toList(this.tigerXml.getElementsByTagName('t'))
        if (!allTs) {
            // The first t node supplies the attribute names and the id prefix,
            // and new nodes are attached relative to existing ones.
            throw new IllegalStateException('No <t> element found in the Tiger-XML document')
        }
        injectMissingWords(allTs)
        movePunctuationToPreviousSentence()
        outputFile.withWriter("UTF-8") { writer ->
            writer.println(this.tigerXml)
        }
    }

    /**
     * First phase: walks the reference table and the t nodes in parallel and
     * creates a t node for every reference word that has no counterpart.
     */
    private void injectMissingWords(List allTs) {
        def words = new ArrayList(this.txmIdWordTableFixed)
        def tAttrs = getTAttrNames(allTs[0])
        def tNodeIdPrefix = allTs[0].getAttribute('id').tokenize('#').first()
        def addedWordIds = []
        int wIx = 0 // next reference word to place
        int tIx = 0 // next t node to compare against
        while (wIx < words.size()) {
            def word = words[wIx]
            if (tIx == allTs.size()) {
                // End of TS file, but still words left in the BFM file.
                addTNode(word, allTs.last(), tAttrs, tNodeIdPrefix, 'append')
                wIx++
                continue
            }
            def tNode = allTs[tIx]
            def tId = getTNodeId(tNode)
            def tWord = tNode.getAttribute('word')
            String tFullId = "${tNodeIdPrefix}#${tId}"
            String wordFullId = "${tNodeIdPrefix}#${word[0]}"
            if (tId == word[0]) {
                if (tWord != word[1]) {
                    println("Mismatched Ids! ($tId)")
                }
                tIx++
                wIx++
            }
            else if (['#', '*'].contains(tWord)) {
                // SRCMF duplicata; compare the same word against the next tNode.
                tIx++
            }
            // Check that the SRCMF corpus doesn't have a bug in it...
            // take(tIx) is empty when tIx == 0, whereas allTs[0..tIx - 1] would
            // become 0..-1, i.e. the whole list.
            else if (!containsPunctuation(word[1])
                    && allTs.take(tIx).any { it.getAttribute('id') == tFullId }) {
                println "Warning: word ${tWord}, id $tId appears twice in corpus!"
                tIx++
            }
            // Check that there's not an extra word in the SRCMF corpus
            // (rare, usu. a tokenisation change)
            else if (!containsPunctuation(word[1])
                    && allTs.drop(tIx).any { it.getAttribute('id') == wordFullId }) {
                println "Warning: word ${tWord}, id $tId does not appear in BFM!"
                tIx++
            }
            else if (addedWordIds.contains(tId)) {
                println "Warning: word ${tWord}, id ${tId} out of sequence in SRCMF corpus!"
                tIx++
            }
            else {
                // Insert word. In the first instance, it will have the same
                // parent as the tNode before which it's being inserted.
                addTNode(word, tNode, tAttrs, tNodeIdPrefix, 'before')
                addedWordIds.add(word[0])
                wIx++
            }
        }
    }

    /** Tells whether the given word form contains one of the PUNCTUATION characters. */
    private static boolean containsPunctuation(form) {
        return (form =~ PUNCTUATION).find()
    }

    /**
     * Second phase: for every terminals element but the first, moves the
     * leading punctuation that closes the previous sentence to the end of the
     * previous terminals element.
     */
    private void movePunctuationToPreviousSentence() {
        def allTerminalses = toList(this.tigerXml.getElementsByTagName('terminals'))
        for (int i = 1 ; i < allTerminalses.size() ; i++) {
            // Collect the leading t nodes whose form has no letter or digit.
            def puncStack = []
            for (def t : toList(allTerminalses[i].getElementsByTagName('t'))) {
                if (t.getAttribute('word') =~ WORD_CHAR) {
                    break
                }
                puncStack.add(t)
            }
            if (!puncStack) {
                continue
            }
            // One character per node, so string indexes map to stack indexes.
            // An empty form is represented by a space, which matches neither
            // of the expressions below.
            def puncString = puncStack.collect {
                def form = it.getAttribute('word')
                form ? form[0] : ' '
            }.join('')
            int moveLeft = 0
            // First, identify LAST instance of sentence-final punctuation.
            def matches = puncString =~ /[\.\,\;\:\!\?\)\]\}»”’]/
            if (matches.size() > 0) {
                moveLeft = puncString.lastIndexOf(matches[-1]) + 1
            }
            // Second, split pairs of straight quotes
            matches = puncString =~ /(""|'')/
            if (matches.size() > 0) {
                moveLeft = [moveLeft, puncString.lastIndexOf(matches[-1][0]) + 1].max()
            }
            // Now, move moveLeft punctuation nodes to the end of the prev. sentence
            puncStack.take(moveLeft).each { allTerminalses[i - 1].appendChild(it) }
        }
    }

    /**
     * Creates a t element for the given reference word and attaches it next to tNode.
     *
     * @param word [id, word form] pair
     * @param tNode existing t node used as anchor
     * @param tAttrs attribute names to create; all but 'id' and 'word' are set to '--'
     * @param tNodeIdPrefix prefix put before '#' in the new node id
     * @param where 'before' (insert before tNode) or 'append' (append to tNode's parent)
     * @return the node that was attached
     * @throws IllegalArgumentException if where is neither 'before' nor 'append'
     */
    private addTNode(word, tNode, tAttrs, tNodeIdPrefix, where) {
        if (where != 'before' && where != 'append') {
            throw new IllegalArgumentException("Bad where value: ${where}")
        }
        def newTNode = tNode.getOwnerDocument().createElement('t')
        for (def anAttr : tAttrs) {
            if (anAttr == 'id') {
                newTNode.setAttribute('id', "${tNodeIdPrefix}#${word[0]}")
            }
            else if (anAttr == 'word') {
                newTNode.setAttribute('word', word[1])
            }
            else {
                newTNode.setAttribute(anAttr, '--')
            }
        }
        if (where == 'before') {
            return tNode.getParentNode().insertBefore(newTNode, tNode)
        }
        return tNode.getParentNode().appendChild(newTNode)
    }

    /** Returns the names of the attributes carried by the given DOM node. */
    def getTAttrNames(tNode) {
        def nodeMap = tNode.attributes
        def nameList = []
        for ( def i = 0 ; i < nodeMap.getLength() ; i++ ) {
            nameList.add( nodeMap.item(i).nodeName )
        }
        return nameList
    }

    /** Returns the part of the node's 'id' attribute that follows the last '#'. */
    def getTNodeId(tNode) {
        return tNode.getAttribute('id').tokenize('#').last()
    }

    /** Copies the elements of an iterable (e.g. a DOM NodeList) into a new list. */
    def toList(def iterable) {
        return iterable.findAll {true};
    }

    /**
     * Concatenates all the text found under a node built by XmlParser.
     * Node here is groovy.util.Node (Groovy default import), not org.w3c.dom.Node.
     */
    def getDeepText(def node) {
        def str = ""
        node.children().each { child ->
            if (child instanceof Node) {
                str += getDeepText(child)
            } else if (child instanceof String) {
                str += child
            }
        }
        return str
    }

    /**
     * Command line entry point: PunctInjectImport.groovy TigerXml.xml XmlTxm.xml
     * Exits with status 2 when the arguments cannot be used.
     */
    public static void main(String[] args)
    {
        def cli = new CliBuilder(
            usage:'PunctInjectImport.groovy TigerXml.xml XmlTxm.xml'
        )
        cli.h(longOpt:'help', 'Prints this message.')
        cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
        def options = cli.parse(args)
        if (options == null) {
            // parse() has already reported the problem
            System.exit(2)
        }
        if (options.h) {
            cli.usage()
            return
        }
        if (options.arguments().size() != 2) {
            println 'Incorrect number of command line arguments... exiting'
            // usage() prints by itself; wrapping it in println would also print "null"
            cli.usage()
            System.exit(2)
        }
        def tigerXmlFile = new File(options.arguments()[0])
        def xmlTxmFile = new File(options.arguments()[1])
        def pii = new PunctInjectImport(tigerXmlFile, xmlTxmFile)
        if (options.o) {
            pii.outputFile = new File(options.o)
        }
        pii.process()
    }
}
|
| tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/srcmfLoader.groovy (revision 911) | ||
|---|---|---|
| 1 |
package org.txm.importer.srcmf |
|
| 2 |
|
|
| 3 |
// srcdir |
|
| 4 |
// txml/master.xml |
|
| 5 |
// txml/*.xml |
|
| 6 |
// txm/*.xml |
|
| 7 |
|
|
| 8 |
// 1 import xml-txm |
|
| 9 |
import ims.tiger.gui.tigerregistry.TIGERRegistry; |
|
| 10 |
import ims.tiger.index.writer.IndexBuilderErrorHandler; |
|
| 11 |
import ims.tiger.index.writer.SimpleErrorHandler; |
|
| 12 |
import ims.tiger.index.writer.XMLIndexing; |
|
| 13 |
import ims.tiger.system.*; |
|
| 14 |
|
|
| 15 |
import javax.xml.stream.*; |
|
| 16 |
|
|
| 17 |
import java.io.BufferedWriter; |
|
| 18 |
import java.io.File; |
|
| 19 |
import java.io.FileOutputStream; |
|
| 20 |
import java.io.IOException; |
|
| 21 |
import java.io.OutputStreamWriter; |
|
| 22 |
import java.io.PrintStream; |
|
| 23 |
import java.net.URL; |
|
| 24 |
import java.util.HashMap; |
|
| 25 |
import java.util.logging.FileHandler |
|
| 26 |
import java.io.File; |
|
| 27 |
|
|
| 28 |
import org.apache.log4j.BasicConfigurator; |
|
| 29 |
import org.txm.export.ts.*; |
|
| 30 |
import org.txm.importer.ApplyXsl2; |
|
| 31 |
import org.txm.utils.xml.DomUtils; |
|
| 32 |
import org.txm.importer.ValidateXml; |
|
| 33 |
import org.txm.importer.XPathResult; |
|
| 34 |
import org.txm.importer.xmltxm.compiler; |
|
| 35 |
import org.txm.importer.xml.pager; |
|
| 36 |
import org.txm.objects.*; |
|
| 37 |
import org.txm.utils.*; |
|
| 38 |
import org.txm.*; |
|
| 39 |
import org.txm.scripts.teitxm.*; |
|
| 40 |
import org.txm.metadatas.*; |
|
| 41 |
import org.txm.utils.i18n.*; |
|
| 42 |
import org.w3c.dom.Element |
|
| 43 |
import org.xml.sax.SAXException; |
|
| 44 |
|
|
| 45 |
import javax.xml.parsers.DocumentBuilderFactory |
|
| 46 |
|
|
| 47 |
String userDir = System.getProperty("user.home");
|
|
| 48 |
boolean debug = false |
|
| 49 |
def MONITOR; |
|
| 50 |
BaseParameters params; |
|
| 51 |
try {params = paramsBinding;MONITOR=monitor} catch (Exception)
|
|
| 52 |
{ println "DEV MODE";//exception means we debug
|
|
| 53 |
debug = true |
|
| 54 |
params = new BaseParameters(new File(userDir, "xml/slethgier/import.xml")) |
|
| 55 |
params.load() |
|
| 56 |
if (!org.txm.Toolbox.isInitialized()) {
|
|
| 57 |
Toolbox.setParam(Toolbox.INSTALL_DIR,new File("/usr/lib/TXM"));
|
|
| 58 |
Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH,new File(userDir,"treetagger")); |
|
| 59 |
Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH,new File(userDir,"treetagger/models")); |
|
| 60 |
Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8"); |
|
| 61 |
Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ","); |
|
| 62 |
Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\""); |
|
| 63 |
Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(System.getProperty("user.home"), "TXM"));
|
|
| 64 |
} |
|
| 65 |
} |
|
| 66 |
if (params == null) { println "no parameters. Aborting"; return; }
|
|
| 67 |
|
|
| 68 |
String corpusname = params.getCorpusName(); |
|
| 69 |
Element corpusElem = params.corpora.get(corpusname); |
|
| 70 |
String basename = params.name; |
|
| 71 |
String rootDir = params.rootDir; |
|
| 72 |
String lang = corpusElem.getAttribute("lang");
|
|
| 73 |
String model = lang |
|
| 74 |
String encoding = corpusElem.getAttribute("encoding");
|
|
| 75 |
boolean annotate = "true" == corpusElem.getAttribute("annotate");
|
|
| 76 |
String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
|
|
| 77 |
def xslParams = params.getXsltParams(corpusElem); |
|
| 78 |
|
|
| 79 |
File txmSrcDir = new File(rootDir, "xml-txm"); |
|
| 80 |
File tigerSrcDir = new File(rootDir, "tiger-xml"); |
|
| 81 |
File headerFile = new File(rootDir, "tiger-xml/header.xml"); |
|
| 82 |
|
|
| 83 |
File binDir = new File(Toolbox.getTXMHOMEPATH(),"corpora/"+basename); |
|
| 84 |
|
|
| 85 |
binDir.deleteDir(); |
|
| 86 |
binDir.mkdirs(); |
|
| 87 |
if (!binDir.exists()) {
|
|
| 88 |
println "Could not create binDir "+binDir |
|
| 89 |
return; |
|
| 90 |
} |
|
| 91 |
if (!txmSrcDir.exists()) {
|
|
| 92 |
println "Error: txm src dir does not exists: "+txmSrcDir |
|
| 93 |
return; |
|
| 94 |
} |
|
| 95 |
if (!tigerSrcDir.exists()) {
|
|
| 96 |
println "Error: tiger src dir does not exists: "+tigerSrcDir |
|
| 97 |
return; |
|
| 98 |
} |
|
| 99 |
File txmDir = new File(binDir,"txm/$corpusname"); |
|
| 100 |
txmDir.deleteDir(); |
|
| 101 |
txmDir.mkdirs(); |
|
| 102 |
// copy txm files |
|
| 103 |
List<File> srcfiles = txmSrcDir.listFiles(); |
|
| 104 |
for (File f : srcfiles) {// check XML format, and copy file into binDir
|
|
| 105 |
if (f.getName().equals("import.xml") || f.getName().equals("metadata.csv") || f.getName().endsWith(".properties"))
|
|
| 106 |
continue; |
|
| 107 |
if (ValidateXml.test(f)) {
|
|
| 108 |
FileCopy.copy(f, new File(txmDir, f.getName())); |
|
| 109 |
} else {
|
|
| 110 |
println "Won't process file "+f; |
|
| 111 |
} |
|
| 112 |
} |
|
| 113 |
if (txmDir.listFiles() == null) {
|
|
| 114 |
println "No txm file to process" |
|
| 115 |
return; |
|
| 116 |
} |
|
| 117 |
|
|
| 118 |
List<File> filelist = txmDir.listFiles(); |
|
| 119 |
Collections.sort(filelist); |
|
| 120 |
|
|
| 121 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
|
|
| 122 |
if (MONITOR != null) MONITOR.worked(25, "COMPILING") |
|
| 123 |
println "-- COMPILING - Building Search Engine indexes" |
|
| 124 |
println "binDir: $binDir" |
|
| 125 |
println "txmDir: $txmDir" |
|
| 126 |
def c = new compiler(); |
|
| 127 |
if (debug) c.setDebug(); |
|
| 128 |
c.setLang(lang); |
|
| 129 |
//c.setSortMetadata(sortMetadata) |
|
| 130 |
if (!c.run(binDir, txmDir, basename, corpusname, filelist)) {
|
|
| 131 |
println "import process stopped"; |
|
| 132 |
return; |
|
| 133 |
} |
|
| 134 |
|
|
| 135 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
|
|
| 136 |
if (MONITOR != null) MONITOR.worked(20, "EDITION") |
|
| 137 |
println "-- EDITION - Building edition" |
|
| 138 |
new File(binDir,"HTML/$corpusname").deleteDir(); |
|
| 139 |
new File(binDir,"HTML/$corpusname").mkdirs(); |
|
| 140 |
File outdir = new File(binDir,"/HTML/$corpusname/default/"); |
|
| 141 |
outdir.mkdirs(); |
|
| 142 |
|
|
| 143 |
|
|
| 144 |
// Number of texts already listed on the console; used only to format the
// progress output (", " between names, line break after every 5 names).
def second = 0
println "Paginating text: "
for (File srcfile : filelist) {
    // Text name = file name without its last extension.
    String txtname = srcfile.getName();
    int i = txtname.lastIndexOf(".");
    if(i > 0) txtname = txtname.substring(0, i);
    List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
    List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);
    Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile);

    // The counter used to be incremented only inside a condition that required
    // it to be > 0 already, so it stayed at 0 and no separator was ever printed.
    if (second > 0) {
        print(", ")
        if ((second % 5) == 0) println ""
    }
    second++
    print(srcfile.getName());

    def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, 500, basename, "pb");
    Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");
    // Declare one page per generated page file. Page numbers are 1-based:
    // the loop index is advanced by the ++i in the addPage call.
    for (i = 0 ; i < ed.getPageFiles().size();) {
        String wordid = ed.getIdx().get(i);
        params.addPage(edition, ""+(++i), wordid);
    }
}
|
| 164 |
|
|
| 165 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
|
|
| 166 |
if (MONITOR != null) MONITOR.worked(20, "FINALIZING") |
|
| 167 |
File paramFile = new File(binDir, "import.xml"); |
|
| 168 |
DomUtils.save(params.root.getOwnerDocument(), paramFile);readyToLoad = true; |
|
| 169 |
|
|
| 170 |
|
|
| 171 |
// TMR Modifications start here |
|
| 172 |
File tigerDir = new File(binDir, "tiger"); |
|
| 173 |
tigerDir.mkdir(); |
|
| 174 |
File master = new File(tigerSrcDir, "master.xml"); |
|
| 175 |
File txmSrcFile = txmSrcDir.listFiles()[0]; // only one text file |
|
| 176 |
|
|
| 177 |
// Run SRCMF cmd line corpus import script |
|
| 178 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
|
|
| 179 |
def importer = new SrcmfImporter() |
|
| 180 |
def tigerXmlAll = importer.doAllButPnc(master, txmSrcFile, headerFile, binDir) |
|
| 181 |
|
|
| 182 |
// Create TigerXml-POS-PNC |
|
| 183 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
|
|
| 184 |
println "create TIGER XML POS PNC" |
|
| 185 |
PunctInjectImport injector; |
|
| 186 |
if (Toolbox.isInitialized()) {
|
|
| 187 |
// get idx from CQP |
|
| 188 |
// INCOMPLETE |
|
| 189 |
// def injector = new PunctInjectImport(tmpFile, words) |
|
| 190 |
println "Error: the loader is not ready to be launched from RCP" |
|
| 191 |
return; |
|
| 192 |
} else {
|
|
| 193 |
injector = new PunctInjectImport(tigerXmlAll, txmSrcFile) |
|
| 194 |
} |
|
| 195 |
injector.outputFile = new File(tigerDir, "TigerPnc.xml") |
|
| 196 |
injector.process() |
|
| 197 |
|
|
| 198 |
// Create the tigersearch.logprop file (used to launch TIGERSearch) |
|
| 199 |
File logprop = new File(tigerDir, "tigersearch.logprop"); |
|
| 200 |
logprop.withWriter("UTF-8") { writer ->
|
|
| 201 |
writer.write("""# Default log configuration of the TIGERSearch suite
|
|
| 202 |
|
|
| 203 |
log4j.rootLogger=WARN,Logfile |
|
| 204 |
log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=INFO |
|
| 205 |
|
|
| 206 |
log4j.appender.Logfile=org.apache.log4j.RollingFileAppender |
|
| 207 |
log4j.appender.Logfile.File=\${user.home}/tigersearch/tigersearch.log
|
|
| 208 |
log4j.appender.Logfile.MaxFileSize=500KB |
|
| 209 |
log4j.appender.Logfile.MaxBackupIndex=1 |
|
| 210 |
|
|
| 211 |
log4j.appender.Logfile.layout=org.apache.log4j.PatternLayout |
|
| 212 |
log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n""")
|
|
| 213 |
} |
|
| 214 |
|
|
| 215 |
// Run TigerRegistry |
|
| 216 |
if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
|
|
| 217 |
BasicConfigurator.configure(); |
|
| 218 |
String uri = tigerXmlAll.getAbsolutePath(); |
|
| 219 |
File tigerBinDir = new File(tigerDir, corpusname) |
|
| 220 |
tigerBinDir.mkdir() |
|
| 221 |
try {
|
|
| 222 |
IndexBuilderErrorHandler handler = new SimpleErrorHandler(tigerBinDir.getAbsolutePath()); |
|
| 223 |
XMLIndexing indexing = new XMLIndexing(corpusname,uri, tigerBinDir.getAbsolutePath(), handler,false); |
|
| 224 |
indexing.startIndexing(); |
|
| 225 |
} |
|
| 226 |
catch (IOException e) { System.out.println("IO: "+e.getMessage()); }
|
|
| 227 |
catch (SAXException e) { System.out.println("SAX: "+e.getMessage()); }
|
|
| 228 |
|
|
| 229 |
tigerXmlAll.delete() |
|
| tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/TigerSubcorpus2Main.groovy (revision 911) | ||
|---|---|---|
| 1 |
/* |
|
| 2 |
Combines subcorpus files to give a single main file. |
|
| 3 |
*/ |
|
| 4 |
// TXM package statement |
|
| 5 |
package org.txm.importer.srcmf |
|
| 6 |
|
|
| 7 |
// Command-line entry point |
|
| 8 |
|
|
| 9 |
// Command-line entry point: expects exactly one positional argument (the
// TIGER master file) and an optional -o/--output target file.
def cli = new CliBuilder(
    usage:'TigerSubcorpus2Main.groovy [options] tiger_master.xml'
)
cli.h(longOpt:'help', 'Prints this message.')
cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
options = cli.parse(args)
if (options == null) {
    // parse() returns null on a malformed command line, after having
    // printed the error and the usage message itself.
    System.exit(2)
}
if (options.h) {
    cli.usage()
    System.exit(0)
}
if (options.arguments().size() != 1) {
    println 'Incorrect number of command line arguments... exiting'
    // usage() prints by itself and returns nothing: wrapping it in println
    // used to add a stray "null" line.
    cli.usage()
    System.exit(2)
}
def tigerMaster = new File(options.arguments()[0])
// Default output file, created in the current working directory.
def outputFile = new File('outputfile.xml')
if (options.o) {
    outputFile = new File(options.o)
}
script(tigerMaster, outputFile)
|
| 26 |
|
|
| 27 |
/**
 * Builds a single TIGER-XML document out of a master file and the
 * subcorpus files it references.
 *
 * Every <subcorpus> child of each <body> element is removed and the children
 * of the referenced subcorpus document are appended to that <body> instead.
 * The referenced file is resolved against the master file's directory, using
 * the "external" attribute minus its first 5 characters (presumably a
 * "file:" prefix - confirm against the producer of the master file).
 *
 * @param tigerMaster the master file to read
 * @param outputFile  the merged file to write, UTF-8 encoded
 */
def script (File tigerMaster, File outputFile) {
    def masterDoc = new XmlParser().parse(tigerMaster)
    // body.subcorpus yields a separate list, so editing the body's children
    // while walking over it is safe.
    masterDoc.body.each { body ->
        body.subcorpus.each { subNode ->
            String relativePath = (""+subNode.@external).substring(5)
            File subCorpusFile = new File(tigerMaster.getParentFile(), relativePath)
            def subDoc = new XmlParser().parse(subCorpusFile)
            body.remove(subNode)
            subDoc.children().each { child ->
                body.append(child)
            }
        }
    }

    // Serialize the merged tree in memory, then write it behind an explicit
    // XML declaration.
    def buffer = new StringWriter()
    def printer = new groovy.util.XmlNodePrinter(new PrintWriter(buffer))
    printer.print(masterDoc)
    def serialized = buffer.toString()
    outputFile.withWriter("UTF-8") { out ->
        out.println('<?xml version="1.0" encoding="UTF-8"?>')
        out.print(serialized)
    }
}
|
| tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/package.html (revision 911) | ||
|---|---|---|
| 1 |
<html> |
|
| 2 |
<body> |
|
| 3 |
<p>TIGERSearch import module. This is a prototype that can only manage SRCMF TIGERSearch sources.</p> |
|
| 4 |
</body> |
|
| 5 |
</html> |
|
| 0 | 6 | |
| tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/MasterReheader.groovy (revision 911) | ||
|---|---|---|
| 1 |
#! /usr/bin/groovy |
|
| 2 |
package org.txm.importer.srcmf; |
|
| 3 |
|
|
| 4 |
/* |
|
| 5 |
* To change this template, choose Tools | Templates |
|
| 6 |
* and open the template in the editor. |
|
| 7 |
*/ |
|
| 8 |
|
|
| 9 |
// Set up globals |
|
| 10 |
// def masterFile = new File('/home/tomr/Documents/Work/lyon12/srcmf/groovy/SrcmfImport/aucassin_surface/master_pos.xml')
|
|
| 11 |
// def outputFile = new File('/home/tomr/Documents/Work/lyon12/srcmf/groovy/SrcmfImport/aucassin_surface/master_pos2.xml')
|
|
| 12 |
// def headerFile = new File('/home/tomr/Documents/Work/SRCMF/srcmf_ts/header_srcmf.xml')
|
|
| 13 |
// def feats = [nt:['cat', 'type', 'coord'], t:['pos', 'form', 'q']] |
|
| 14 |
// def firstFeat = [t:'word', nt:'cat'] |
|
| 15 |
|
|
| 16 |
// Command-line entry point |
|
| 17 |
// Command-line entry point: expects the master file and the header file as
// the two positional arguments.
def cli = new CliBuilder(
    usage:'MasterReheader.groovy [options] master_file.xml header_file.xml'
)
cli.h(longOpt:'help', 'Prints this message.')
cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
cli.nt(longOpt:'ntfeats', args:1, argName:'ntfeats', 'NT features for which to provide value node.')
cli.t(longOpt:'tfeats', args:1, argName:'tfeats', 'T features for which to provide value node.')
cli.nt1(longOpt:'ntfeat1st', args:1, argName:'1st ntfeat', 'First NT feature listed in header.')
cli.t1(longOpt:'tfeat1st', args:1, argName:'1st tfeat', 'First T feature listed in header.')
options = cli.parse(args)
if (options == null) {
    // Malformed command line: parse() has already printed the error and
    // the usage message, nothing more to do.
} else if (options.h) {
    cli.usage()
} else if (options.arguments().size() == 2) {
    def masterFile = new File(options.arguments()[0])
    def headerFile = new File(options.arguments()[1])
    def masterFolder = masterFile.getCanonicalFile().getParent()
    // Without -o, the result is written next to the master file.
    def outputFile = options.o ? new File(options.o) : new File(masterFolder, 'MasterReheader_out.xml')
    // Default first features, used when -nt1 / -t1 are not given. These
    // defaults were previously computed but never passed to script().
    def ntfirst = options.nt1 ?: 'cat'
    def tfirst = options.t1 ?: 'word'
    // options.nts / options.ts are the value lists of -nt / -t; they are
    // false (not a list) when the option is absent, hence the fallback.
    script(
        masterFile, headerFile, outputFile,
        ['nt':(options.nts ?: []), 't':(options.ts ?: [])],
        ['nt':ntfirst, 't':tfirst]
    )
} else {
    println 'Incorrect number of command line arguments... exiting'
    // usage() prints by itself; println-ing its (void) result added "null".
    cli.usage()
}
|
| 54 |
|
|
| 55 |
/**
 * Rebuilds the <feature> declarations of a TIGER master file from the
 * attributes actually used in its subcorpus files.
 *
 * Steps: collect every attribute name/value found on the <t> and <nt> nodes
 * of each referenced subcorpus (except "id"), drop the existing <feature>
 * nodes of the master, then add one <feature> per collected attribute. For
 * the features listed in feats, one <value> child is added per collected
 * value; its text is looked up in the header file, '[unknown]' when absent.
 *
 * @param masterFile master file to read; subcorpus files are resolved
 *                   against its directory
 * @param headerFile header file providing the descriptions of the values
 * @param outputFile file receiving the updated master document (UTF-8)
 * @param feats      map with keys 'nt' and 't': feature names for which
 *                   value nodes must be generated (a missing/false entry
 *                   means none)
 * @param firstFeat  map with keys 'nt' and 't': feature to list first. An
 *                   entry that is not an existing attribute is replaced, in
 *                   the caller's map, by the alphabetically first attribute.
 */
def script(
    File masterFile, File headerFile, File outputFile, HashMap feats, HashMap firstFeat
) {
    // Load master and header files
    def master = new XmlParser().parse(masterFile)
    def header = new XmlParser().parse(headerFile)
    def masterFolder = masterFile.getCanonicalFile().getParent()

    // attribute name -> set of values, per node type
    def attrVal = [nt:[:], t:[:]]

    // Scan subcorpus files and build attribute lists.
    master.body.subcorpus.each {
        // @external minus its 5 first characters is used as a relative path
        def subcorpusFile = new File (masterFolder, it.'@external'[5..-1])
        def subcorpus = new XmlParser().parse(subcorpusFile)
        // Closure for t & nt nodes processing.
        def getvals = { node, type ->
            node.attributes().each { mEntry ->
                if (! attrVal[type].keySet().contains(mEntry.getKey())) {
                    attrVal[type][mEntry.getKey()] = new HashSet()
                }
                attrVal[type][mEntry.getKey()].add(mEntry.getValue())
            }
        }
        subcorpus.s.graph.terminals.t.each { getvals.call(it, 't') }
        subcorpus.s.graph.nonterminals.nt.each { getvals.call(it, 'nt') }
    }
    // Id isn't an attribute in the header.
    attrVal['t'].remove('id')
    attrVal['nt'].remove('id')

    // Remove old feature nodes in master file. The GPath result is a
    // separate list, so removing from the parents while iterating is safe.
    master.head.annotation.feature.each { oldNode ->
        oldNode.parent().remove(oldNode)
    }
    assert (! master.head.annotation.feature)

    // Check firstFeat was relevant
    ['t', 'nt'].each { type ->
        if (! (attrVal[type].keySet().contains(firstFeat[type]))) {
            firstFeat[type] = attrVal[type].keySet().sort()[0]
        }
    }
    assert attrVal['t'].keySet().contains(firstFeat['t'])
    assert attrVal['nt'].keySet().contains(firstFeat['nt'])

    // Ordered feature names: the first feature, then all the others.
    def featList = [:]
    ['t', 'nt'].each { type ->
        featList[type] = [firstFeat[type]]
        featList[type].addAll(attrVal[type].keySet().findAll { it != firstFeat[type] })
    }

    // All <feature> nodes of the header, whatever their depth; computed once.
    def headerFeatures = header.'**'.feature

    // Add new feature and value nodes
    ['t', 'nt'].each { type ->
        // Tolerate a missing or false entry (e.g. option not given on the CLI).
        def withValues = feats[type] ?: []
        featList[type].each { feat ->
            def fNode = new Node(master.head.annotation[0], 'feature',
                ['domain':type.toUpperCase(), 'name':feat]
            )
            // Add value nodes if the feature is listed in 'feats'
            if (withValues.contains(feat)) {
                // Sanity checks on the header, only enforced when it is needed.
                assert headerFeatures
                assert headerFeatures[0].'@name'
                assert headerFeatures[0].'@domain'
                assert ['NT', 'T'].contains(headerFeatures[0].'@domain')
                // The header feature does not depend on the value: look it up once.
                def hFNode = headerFeatures.find {
                    it.'@name' == feat && (
                        it.'@domain' == type.toUpperCase() || it.'@domain' == 'FREC'
                    )
                }
                attrVal[type][feat].each { value ->
                    def hValue = hFNode ? hFNode.value.find { it.'@name' == value } : null
                    def vText = (hValue != null) ? hValue.text() : '[unknown]'
                    new Node(fNode, 'value', ['name':value], vText)
                }
            }
        }
    }

    // Save to output_file. XmlUtil.serialize() emits an XML declaration
    // announcing UTF-8, so the file must really be written in UTF-8 and not
    // in the platform default charset.
    outputFile.withWriter("UTF-8") { writer ->
        writer << groovy.xml.XmlUtil.serialize(master)
    }
}
|
| 138 |
|
|
| tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/SrcmfImporter.groovy (revision 911) | ||
|---|---|---|
| 1 |
/* |
|
| 2 |
* Calls all import scripts |
|
| 3 |
*/ |
|
| 4 |
|
|
| 5 |
/** |
|
| 6 |
* |
|
| 7 |
* @author tmr |
|
| 8 |
*/ |
|
| 9 |
|
|
| 10 |
// Command line form of import statements: |
|
| 11 |
// import MasterReheader |
|
| 12 |
// import PunctInjectImport |
|
| 13 |
// import SubcorpusDataInject |
|
| 14 |
// import TigerSubcorpus2Main |
|
| 15 |
// import java.util.logging.FileHandler |
|
| 16 |
// import javax.xml.parsers.DocumentBuilderFactory |
|
| 17 |
|
|
| 18 |
// TXM package statement |
|
| 19 |
package org.txm.importer.srcmf |
|
| 20 |
|
|
| 21 |
import java.util.logging.*; |
|
| 22 |
|
|
| 23 |
|
|
| 24 |
// Command line entry point |
|
| 25 |
// Command line entry point: expects the TIGER master file, the XML-TXM file
// and the header file, in that order.
def cli = new CliBuilder(
    usage:'SrcmfImport.groovy [options] tiger_master.xml xml_txm.xml header_file.xml'
)
cli.h(longOpt:'help', 'Prints this message.')
options = cli.parse(args)
if (options == null) {
    // parse() returns null on a malformed command line, after having
    // printed the error and the usage message itself.
    System.exit(2)
}
if (options.h) {
    cli.usage()
    System.exit(0)
}
if (options.arguments().size() != 3) {
    println 'Incorrect number of command line arguments... exiting'
    // usage() prints by itself; println-ing its (void) result added "null".
    cli.usage()
    System.exit(2)
}

def tigerFile = new File(options.arguments()[0])
def txmFile = new File(options.arguments()[1])
def headerFile = new File(options.arguments()[2])
// The binary directory is the grand-parent directory of the XML-TXM file.
def tigerXmlAll = doAllButPnc(
    tigerFile,
    txmFile,
    headerFile,
    txmFile.getAbsoluteFile().getParentFile().getParentFile()
)
doPnc(tigerXmlAll, txmFile)
// The merged file is only an intermediate product.
tigerXmlAll.delete()
|
| 47 |
|
|
| 48 |
/**
 * Runs every import step except the punctuation injection: PoS injection
 * into the subcorpus files, re-heading of the master file, then merge of the
 * master and subcorpus files into a single "TigerAll.xml".
 *
 * Intermediate and result files are written in binDir/tiger; the injection
 * log goes to binDir/tiger.log.
 *
 * @param tigerFile  the TIGER master file
 * @param txmFile    an XML-TXM file; NOTE(review): only its directory is
 *                   used, the file actually read is the first one listed in
 *                   that directory - confirm this is intended
 * @param headerFile header file describing the feature values
 * @param binDir     binary directory of the corpus
 * @return the merged TIGER-XML file
 */
def doAllButPnc(File tigerFile, File txmFile, File headerFile, File binDir) {
    // Run pos injection script
    File txmSrcDir = txmFile.getAbsoluteFile().getParentFile()
    File tigerDir = new File(binDir, "tiger")
    tigerDir.mkdir()
    File masterpos = new File(tigerDir, "master_pos.xml")
    File xmltxm = txmSrcDir.listFiles()[0]
    File logFile = new File(binDir, "tiger.log")
    def logHandler = new FileHandler(logFile.getAbsolutePath())
    try {
        def sdi = new SubcorpusDataInject(xmltxm, logHandler, "vers")
        sdi.processMaster(tigerFile, masterpos)
    } finally {
        // The handler is registered on a static logger: without this, every
        // call would leave one more open log file (and its lock file)
        // attached to that logger.
        SubcorpusDataInject.logger.removeHandler(logHandler)
        logHandler.close()
    }

    // Run reheader script
    def reheader = new MasterReheader()
    File tmp = File.createTempFile("tmp", ".xml",tigerDir)
    def feats = ['nt':['cat', 'type', 'coord'], 't':['pos', 'form', 'q']]
    def firstFeat = ['nt':'cat', 't':'word']
    reheader.script(masterpos, headerFile, tmp, feats, firstFeat)
    // createTempFile() already creates an (empty) file, so testing for
    // existence alone could never detect a failure.
    if (!tmp.exists() || tmp.length() == 0) {
        println "Error: reheader failed"
    }
    masterpos.delete()
    if (!tmp.renameTo(masterpos)) {
        println "Error: could not rename $tmp to $masterpos"
    }

    // Run merge master & subcorpus script
    def tigerXmlAll = new File(masterpos.getParentFile(), "TigerAll.xml")
    def mergescript = new TigerSubcorpus2Main()
    mergescript.script(masterpos, tigerXmlAll)
    return tigerXmlAll
}
|
| 78 |
|
|
| 79 |
/**
 * Runs the punctuation injection on the merged TIGER-XML file; the result
 * is written to "TigerPnc.xml" in the directory of tigerXmlAll.
 *
 * @param tigerXmlAll the merged TIGER-XML file
 * @param txmFile     the XML-TXM file handed to PunctInjectImport
 */
def doPnc(File tigerXmlAll, File txmFile) {
    // Declared locally: an undeclared name would be stored in the script
    // binding and outlive the call.
    def injector = new PunctInjectImport(tigerXmlAll, txmFile)
    injector.outputFile = new File(tigerXmlAll.getParentFile(), "TigerPnc.xml")
    injector.process()
}
|
| tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/SubcorpusDataInject.groovy (revision 911) | ||
|---|---|---|
| 1 |
#! /usr/bin/groovy |
|
| 2 |
package org.txm.importer.srcmf; |
|
| 3 |
/* |
|
| 4 |
* To change this template, choose Tools | Templates |
|
| 5 |
* and open the template in the editor. |
|
| 6 |
*/ |
|
| 7 |
|
|
| 8 |
import java.util.logging.Logger |
|
| 9 |
import java.util.logging.FileHandler |
|
| 10 |
import java.util.logging.SimpleFormatter |
|
| 11 |
import java.util.logging.Level |
|
| 12 |
|
|
| 13 |
|
|
| 14 |
class SubcorpusDataInject {
|
|
| 15 |
// Globals |
|
| 16 |
def textForm = 'prose' |
|
| 17 |
def TEI = null |
|
| 18 |
def wNodes = [] |
|
| 19 |
def wNodesInQ = [] |
|
| 20 |
// Set up namespaces |
|
| 21 |
static nsXml = new groovy.xml.Namespace('http://www.w3.org/XML/1998/namespace', 'xml')
|
|
| 22 |
static nsTxm = new groovy.xml.Namespace('http://textometrie.org/1.0', 'txm')
|
|
| 23 |
// Set up a logger |
|
| 24 |
static logger = Logger.getLogger('subcorpusDataInjectLog')
|
|
| 25 |
// END GLOBALS |
|
| 26 |
|
|
| 27 |
/**
 * Loads the XML-TXM file and prepares the word lists used by the injection.
 *
 * The text form is, by priority: the description of the category referenced
 * by the TEI header (textClass/catRef), else newTextForm, else the default
 * 'prose'. For the 'vers' and 'mixte' forms, <lb/> elements are converted
 * into an "lb" attribute ('start' / 'end') on the neighbouring words.
 *
 * @param xmlTxmFile  the XML-TXM file to parse
 * @param fh          handler added to the class logger; it is given a
 *                    SimpleFormatter. The caller remains responsible for
 *                    closing it.
 * @param newTextForm default text form; ignored when null or empty
 */
SubcorpusDataInject (
    File xmlTxmFile,
    FileHandler fh,
    String newTextForm
) {
    // Reset global values
    if (newTextForm) {
        textForm = newTextForm
    }
    // Set up logger
    def formatter = new SimpleFormatter()
    logger.addHandler(fh)
    logger.setLevel(Level.ALL)
    fh.setFormatter(formatter)
    // Parse the XML-TXM file
    logger.log(Level.INFO, "Parsing XML-TXM file (may take a while).")
    TEI = new XmlParser().parse(xmlTxmFile)
    // Get text form from TEI header (if expressed)
    logger.log(Level.INFO, "Reading TEI header to detect text form.")
    if (TEI.teiHeader.profileDesc.textClass.catRef) {
        // The target is a "#id" reference: drop its first character.
        def catId = TEI.teiHeader.profileDesc.textClass.catRef.'@target'[0][1..-1]
        def category = TEI.teiHeader.encodingDesc.'**'.category.find {
            it.'@id' == catId || it.attribute(nsXml.id) == catId
        }
        // A dangling reference, or a category without description, used to
        // abort the import with a NullPointerException.
        def catDesc = category?.catDesc
        if (catDesc) {
            textForm = catDesc[0].text()
            logger.log(Level.INFO, "Found text form '$textForm' in TEI header.")
        } else {
            logger.log(Level.WARNING, "Category '$catId' not described in TEI header, assuming '$textForm'.")
        }
    } else {
        logger.log(Level.INFO, "No text form in TEI header, assuming '$textForm'.")
    }
    // Convert Lbs to Property
    if (textForm == 'vers' || textForm == 'mixte') {
        logger.log(Level.INFO, "Converting <lb/> tags to a word property for later processing.")
        // Document-order traversal: 'undone' holds the nodes still to visit.
        def undone = TEI.text.body
        def lastLb = false
        def lastWord = null
        while (undone) {
            def aNode = undone.remove(0)
            if (aNode.name().getLocalPart() == 'lb') {
                // The last word seen before the line break ends a line.
                lastLb = true
                if (lastWord) {
                    lastWord.'@lb' = 'end'
                }
            } else if (aNode.name().getLocalPart() == 'w') {
                // Punctuation is ignored: only real words start or end a line.
                if (!punctTest(aNode)) {
                    if (lastLb) {
                        aNode.'@lb' = 'start'
                    }
                    lastWord = aNode
                    lastLb = false
                }
            } else {
                // Any other element: visit its element children next.
                undone.addAll(0, aNode.children().findAll { it.getClass() == Node })
            }
        }
    }
    // Build node lists
    logger.log(Level.INFO, "Building node lists.")
    wNodes = TEI.text.'**'.findAll { it.name().getLocalPart() == 'w' }
    wNodesInQ = TEI.text.'**'.q.'**'.findAll { it.name().getLocalPart() == 'w' }
}
|
| 86 |
|
|
| 87 |
/**
 * Command-line entry point.
 *
 * Expects two positional arguments: the XML-TXM file, then the TIGERSearch
 * file (a subcorpus file, or a master file when -m is given).
 *
 * @param args the raw command-line arguments
 */
static main(def args) {
    // Defaults used when -o / -l are not given
    def outputFile = 'test-out.xml'
    def logName = 'subcorpusDataInjectLog.log'
    // Parse command line args
    def cli = new CliBuilder(
        usage:'SubcorpusDataInject.groovy [options] TXM_file.xml TS_file.xml'
    )
    cli.h(longOpt:'help', 'Prints this message')
    cli.m(longOpt:'master', 'Specifies input file is a TS master file.')
    cli.v(longOpt:'verse', 'Sets default text form to verse. Overridden by TEI header.')
    cli.l(longOpt:'logfile', args:1, argName:'logfile.log', 'Use given file for log.')
    cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file')
    println args
    def options = cli.parse(args)
    if (options == null) {
        // Malformed command line: parse() already printed error and usage.
        return
    }
    if (options.h) {
        cli.usage()
        return
    }
    if (options.arguments().size() != 2) {
        println "Wrong number of arguments."
        // usage() prints by itself; println-ing its (void) result added "null".
        cli.usage()
        return
    }
    // Call XML TXM loader
    def xmlTxmFile = new File (options.arguments()[0])
    def logHandler = new FileHandler(options.logfile ?: logName)
    def defaultForm = options.v ? 'vers' : ''
    def sdi = new SubcorpusDataInject(xmlTxmFile, logHandler, defaultForm)
    // Call process master or process subcorpus
    def tsFile = new File(options.arguments()[1]).getCanonicalFile()
    def target = new File (options.output ?: outputFile)
    if (options.m) {
        sdi.processMaster(tsFile, target)
    } else {
        sdi.processSubcorpus(tsFile, target)
    }
}
|
| 132 |
|
|
| 133 |
/**
 * Runs processSubcorpus() on each subcorpus listed in a master file, then
 * saves a copy of the master file pointing to the generated "*_pos.xml"
 * subcorpus files.
 *
 * Subcorpus files are read from the directory of masterFile and written to
 * the directory of outputFile.
 *
 * @param masterFile the TIGER master file to read
 * @param outputFile the new master file to write (UTF-8)
 */
def processMaster (File masterFile, File outputFile) {
    logger.log(Level.INFO, "Loading corpus master file")
    // Work on absolute parents: a file given without directory has a null
    // parent, and new File('', name) resolves against the filesystem root
    // rather than the current directory.
    File masterFolder = masterFile.getAbsoluteFile().getParentFile()
    File outputFolder = outputFile.getAbsoluteFile().getParentFile()
    def master = new XmlParser().parse(masterFile)
    // Process each subcorpus file
    master.body.subcorpus.each {
        String external = it.'@external'
        // [5..-1] drops the 5 first characters of the reference,
        // [..-5] additionally drops the 4-character extension.
        processSubcorpus(
            new File (masterFolder, external[5..-1]),
            new File (outputFolder, external[5..-5] + "_pos.xml")
        )
        it.'@external' = external[0..-5] + "_pos.xml"
    }
    logger.log(Level.INFO, "Saving new master file ${outputFile.getName()}")

    // XmlUtil.serialize() declares UTF-8 in the XML prolog: write UTF-8
    // explicitly instead of relying on the platform default charset.
    outputFile.withWriter("UTF-8") { writer ->
        writer << groovy.xml.XmlUtil.serialize(master)
    }
    logger.log(Level.INFO, "Run reheader on this file!")
}
|
| 163 |
|
|
| 164 |
/**
 * Injects, into every terminal (<t>) of a TIGER subcorpus file, the "pos",
 * "form" and "q" attributes computed from the matching <w> node of the
 * XML-TXM file, recalculates the "headpos" attribute of the non-terminals
 * and saves the result.
 *
 * Terminals and words are walked in parallel; they are paired with
 * idmatch(). Terminals whose word is '#' get '--' values, terminals whose
 * word is '*' (duplicatas) reuse the word of the terminal they duplicate,
 * and punctuation words of the XML-TXM file are skipped.
 *
 * @param subcorpusFile the TIGER subcorpus file to read
 * @param outputFile    the enriched subcorpus file to write (UTF-8)
 */
def processSubcorpus (File subcorpusFile, File outputFile) {
    // Parse Subcorpus file
    logger.log(Level.INFO, "Loading subcorpus file ${subcorpusFile.getName()}")
    def TS = new XmlParser().parse(subcorpusFile)
    // Work queue: terminals are removed from its head once processed.
    def tNodes = TS.s.graph.terminals.t
    // Separate list of the same terminals, never modified: used for lookups.
    def tNodesFixed = TS.s.graph.terminals.t

    // Build duplicata correspondance list: [duplicata id, id of the real node]
    logger.log(Level.INFO, "Building duplicata correspondance list")
    def duplIdMatches = tNodes.findAll {it.'@word' == '*'}.collect { dupl ->
        def duplId = dupl.'@id'
        // The real node is the target of the 'dupl' secondary edge of the
        // non-terminal that dominates the duplicata.
        def realId = dupl.parent().parent().nonterminals.nt.edge.find {
            it.'@idref' == duplId
        }.parent().secedge.find {
            it.'@label' == 'dupl'
        }.'@idref'
        [duplId, realId]
    }
    logger.log(Level.INFO, "${duplIdMatches.size()} duplicatas found.")

    // Add pos & form tags
    logger.log(Level.INFO, "Adding PoS and form tags to each t-node")
    // Get the index of the first wNode corresponding to a tNode.
    // -1 means "not found yet"; 0 is a valid position (the previous
    // truthiness test rejected index 0 and accepted -1).
    def i = 0
    def wIx = -1
    while (i < tNodes.size()) {
        // Avoid trying to find "fake" first words, it's very inefficient
        if (tNodes[i].'@word' != '#') {
            wIx = wNodes.findIndexOf { idmatch(tNodes[i], it) }
        }
        if (wIx >= 0) {
            break
        }
        i++
    }
    assert wIx >= 0, "$i"

    /*
     * MAIN ITERATOR BEGINS HERE FOR POS INJECTION
     */
    while (tNodes) {
        def thisTNode = tNodes.remove(0)
        def thisTNodeId = thisTNode.'@editionId'
        def wNode = null
        def q = false

        // Marks the terminal as having no usable word information.
        def injectNothing = {
            thisTNode.'@pos' = '--'
            thisTNode.'@form' = '--'
            thisTNode.'@q' = '--'
        }

        // PoS / q / form injecting closure; reads the current wNode and q.
        def injectPos = {
            // Q tag
            thisTNode.'@q' = q ? 'y' : 'n'
            // Form tag
            if (textForm == 'prose') {
                thisTNode.'@form' = 'prose'
            } else if (
                textForm == 'vers' ||
                (
                    textForm == 'mixte'
                    && wNode.parent().parent().name().getLocalPart() == 'ab'
                )
            ) {
                if (! wNode.'@lb') {
                    thisTNode.'@form' = 'vers'
                } else if (wNode.'@lb' == 'start') {
                    thisTNode.'@form' = 'vers_debut'
                } else if (wNode.'@lb' == 'end') {
                    thisTNode.'@form' = 'vers_fin'
                }
            } else {
                // Any other text form, or 'mixte' outside an <ab>
                thisTNode.'@form' = 'prose'
            }
            // Find PoS tag in BFM file: txm:ana tags first, then interp tags
            def txmAnaList = wNode[nsTxm.ana]
            if (! txmAnaList) {
                txmAnaList = wNode.interp
            }
            // Priority: #pos_syn, then the type attribute of the word (only
            // when there is no annotation at all), then #pos, then #fropos.
            // The #pos and #fropos tests used '=' instead of '==': they
            // overwrote the type of the first annotation and always matched.
            def anaOfType = { wanted -> txmAnaList.find { it.'@type' == wanted } }
            def posSyn = anaOfType('#pos_syn')
            if (posSyn != null) {
                thisTNode.'@pos' = posSyn.text()
            } else if (! txmAnaList && wNode.'@type') {
                thisTNode.'@pos' = wNode.'@type'
            } else {
                def ana = anaOfType('#pos')
                if (ana == null) {
                    ana = anaOfType('#fropos')
                }
                thisTNode.'@pos' = (ana != null) ? ana.text() : '--'
            }
        }
        // End of injecting closure.

        // Word at the current position; null once the word list is exhausted.
        def currentW = (wIx < wNodes.size()) ? wNodes[wIx] : null

        // Check whether wNode matches tNode
        if (thisTNode.'@word' == '#') {
            // Not expecting to find these in BFM file, all OK.
            injectNothing.call()
        } else if (currentW != null && idmatch(thisTNode, currentW)) {
            // Perfect match
            wNode = currentW
            q = wNodesInQ.contains(wNode)
            injectPos.call()
            wIx++
        } else if (thisTNode.'@word' == '*') {
            // Duplicata, use dupl match list to find correct wNode
            def duplIdMatch = duplIdMatches.find { it[0] == thisTNode.'@id' }
            if (duplIdMatch) {
                def matchTNode = tNodesFixed.find { it.'@id' == duplIdMatch[1] }
                assert matchTNode, "$duplIdMatch"
                // To enhance performance, assume initially that the wNode
                // matching the duplicata is within +- 10 words of the current
                // position: searches over the entire (very long) list are
                // avoided when possible.
                if (wIx + 10 < wNodes.size() && wIx - 10 > 0) {
                    wNode = wNodes[wIx-10 .. wIx+10].find {
                        idmatch(matchTNode, it)
                    }
                }
                // Use whole list if it failed.
                if (! wNode) {
                    wNode = wNodes.find {
                        idmatch(matchTNode, it)
                    }
                }
                if (wNode != null) {
                    q = wNodesInQ.contains(wNode)
                    injectPos.call()
                } else {
                    // Real node absent from the word list: report it
                    // instead of failing on a null word.
                    injectNothing.call()
                    logger.log(
                        Level.WARNING,
                        "<w/> node matching TS duplicata $thisTNodeId not found."
                    )
                }
                // Don't update wIx value!
            } else {
                injectNothing.call()
                logger.log(
                    Level.WARNING,
                    "<w/> node matching TS duplicata $thisTNodeId not found."
                )
            }
        } else if (currentW != null && punctTest(currentW)) {
            // not expecting to find these in TS file; try same tNode again.
            tNodes.add(0, thisTNode)
            wIx++
        } else {
            // Search the whole word list, once.
            def found = wNodes.find {
                idmatch (thisTNode, it)
            }
            if (found == null) {
                // Check the SRCMF corpus hasn't acquired an extra word
                logger.log(
                    Level.WARNING,
                    "TS word $thisTNodeId (${thisTNode.'@word'}) is " +
                    "absent from BFM file."
                )
                injectNothing.call()
            } else {
                // It doesn't match, it's not a duplicata, it's not BFM
                // punctuation, but it IS found somewhere in the BFM corpus.
                logger.log(
                    Level.WARNING,
                    "Discontinuity in TS corpus before word $thisTNodeId " +
                    "(${thisTNode.'@word'})."
                )
                wNode = found
                q = wNodesInQ.contains(wNode)
                injectPos.call()
                // Resume the parallel walk right after the word found.
                wIx = wNodes.indexOf(wNode) + 1
            }
        }
    }
    /*
     * End Main iteration
     */

    /*
     * Recalculate headpos
     */
    logger.log(Level.INFO, "Recalculating headpos.")
    TS.s.graph.nonterminals.nt.each { nt ->
        // Edges labelled 'L' designate the head(s) of the non-terminal.
        def a = nt.edge.findAll { it.'@label' == 'L' }
        if (a.size() == 0) {
            nt.'@headpos' = '--'
        } else if (a.size() == 1) {
            nt.'@headpos' = nt.parent().parent().terminals.t.find {
                it.'@id' == a[0].'@idref'
            }.'@pos'
        } else {
            // Several heads: keep the first PoS of the priority list below
            // that is carried by one of them, flagged with '?'. (each()
            // returned the edges themselves, so the test never succeeded.)
            def terminals = nt.parent().parent().terminals.t
            def headPosList = a.collect { edge ->
                terminals.find { it.'@id' == edge.'@idref' }?.'@pos'
            }
            def headpos = ['NOM', 'PRO', 'ADJ', 'VER', 'ADV', 'CON', 'PRE', 'DET'].find {
                headPosList.contains(it)
            }
            if (headpos != null) {
                nt.'@headpos' = headpos + '?'
            }
        }
        assert nt.'@headpos'
    }
    logger.log(Level.INFO, "Saving new subcorpus file ${outputFile.getName()}")

    // XmlUtil.serialize() declares UTF-8 in the XML prolog: write UTF-8
    // explicitly instead of relying on the platform default charset.
    outputFile.withWriter("UTF-8") { writer ->
        writer << groovy.xml.XmlUtil.serialize(TS)
    }
}
|
| 384 |
// Subroutine self-test to establish whether a Groovy node is punctuation |
|
| 385 |
/**
 * Tells whether a word node is punctuation.
 *
 * When the node has a non-empty "type" attribute, only that attribute is
 * tested. Otherwise the text of its txm:ana children is tested or, when it
 * has none, the text of its interp children. A value denotes punctuation
 * when it starts with 'pon', ignoring case.
 *
 * @param gNode the node to test
 * @return true when the node is punctuation, false otherwise
 */
private punctTest(gNode) {
    def startsWithPon = { value -> value.toLowerCase().startsWith('pon') }

    def typeAttr = gNode.'@type'
    if (typeAttr) {
        return startsWithPon(typeAttr)
    }
    def anaNodes = gNode[nsTxm.ana]
    if (anaNodes) {
        return anaNodes.find { startsWithPon(it.text()) } != null
    }
    def interpNodes = gNode.interp
    if (interpNodes) {
        return interpNodes.find { startsWithPon(it.text()) } != null
    }
    return false
}
|
| 396 |
|
|
| 397 |
private idmatch (tNode, wNode) {
|
|
| 398 |
// Matches a TS node with a <w/> node |
|
| 399 |
def idMatch = false |
|
| 400 |
def wNodeId = wNode.'@id' |
|
| 401 |
if (! wNodeId) {
|
|
| 402 |
wNodeId = wNode.attribute(nsXml.id) |
|
| 403 |
} |
|
| 404 |
def tNodeId = tNode.'@editionId' |
|
| 405 |
def regex = /(\d+)([_a-z]+)?$/ |
|
| 406 |
def m1 = (tNodeId =~ regex) |
|
| 407 |
def m2 = (wNodeId =~ regex) |
|
| 408 |
if ( m1.find() && m2.find() ) {
|
|
| 409 |
idMatch = (m1.group(1).toInteger() == m2.group(1).toInteger()) |
|
| 410 |
} else {
|
|
| 411 |
logger.log( |
|
| 412 |
Level.WARNING, |
|
| 413 |
"Idmatch routine can't find integer part of one or both of the" + |
|
| 414 |
"paired Ids for nodes $tNode, $wNode" |
|
| 415 |
) |
|
| 416 |
} |
|
Formats disponibles : Unified diff