Révision 3238

tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/dth/AnnotateHobbesBiblicalReferencesMacro.groovy (revision 3238)
1
// Copyright © 2021 ENS de Lyon, CNRS, University of Franche-Comté
2
// Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
3
// @author sheiden
4
// @author mdecorde
5

  
6
// STANDARD DECLARATIONS
7
package org.txm.macro.projets.dth
8

  
9
import org.kohsuke.args4j.*
10
import groovy.transform.Field
11
import org.txm.rcp.swt.widget.parameters.*
12
import org.txm.rcp.editors.concordances.*
13
import org.txm.searchengine.cqp.corpus.CQPCorpus
14
import org.txm.searchengine.cqp.corpus.MainCorpus
15
import org.txm.functions.concordances.*
16
import org.txm.annotation.urs.*
17
import org.txm.concordance.core.functions.Concordance
18
import org.txm.concordance.rcp.editors.ConcordanceEditor
19
import visuAnalec.elements.Unite
20
import visuAnalec.vue.Vue
21
import org.txm.searchengine.cqp.CQPSearchEngine
22
import org.apache.commons.lang.StringUtils
23
import org.txm.macro.cqp.CQPUtils
24
import org.txm.searchengine.cqp.ICqiClient
25
import org.txm.utils.i18n.LangFormater
26

  
27
// Script-level settings. None of these three is read anywhere in the visible part of this script.
move_start = move_end = 0
create_only_if_new = true

// Biblical References CQL Query.
// Token sequence, in order:
//   - optional book number: "1" or "2", optionally followed by a period
//   - the book name or abbreviation, marked as the match target (@)
//   - optional token whose pos is "p"
//   - one token starting with a digit
//   - any number of tokens that start with a digit, start with "v", or are punctuation
//     other than ")", "]", ":" and "("
query = /[word="[1-2]\.?"]? @[word="Mat\.|Acts|Gen\.|Cor\.|chap\.|Deut\.|Exod\.|Sam\.|Rom\.|Luke|Matth\.|Iohn|Joh\.|Kings|Act\.|John|Prov\.|Tim\.|Heb\.|Mark|Pet\.|Psal\.|Gal\.|Tit\.|Isaiah|Levit\.|Ezek\.|Math\.|Col\.|Math|Numb\.|Numbers|Zach\.|Chron\.|Eph\.|Epist\.|Isay|Luk\.|Marke|Dan\.|Ephes\.|Esay|Ioh\.|Isai|Jo\.|Job|Josh\.|Mar\.|Proverbs|Psalm|Vers\.|Colos\.|Ierem\.|Iosh\.|Jer\.|Jerem\.|Judges|Micah|Rev\.|Rom|Thess\.|Titus|Ac\.|Act|Apoc\.|Apocalypse|Baruch|Chro\.|Coll\.|Deuteronomy|Eccles\.|Ecclesiastes|Ecclus\.|Esdras|Exo\.|Exod|Ezektel|Gen|Haggai|Iames|Ier\.|Iob\.|Ioel|Ioshua|Isa|Isa\.|Iud\.|Iude|Iudg\.|Joel|Joshua|Jud\.|Judg\.|Matt\.|Matthew|Num\.|Numb|Psa\.|Ruth|Timothy|Zach"] [pos="p"]? [word="[0-9].*"] ([word="[0-9].*|v.*|\p{P}" & word!="\)|]|:|\("])*/

// Name of the CQP subcorpus that receives the result of the query
bibRefs = "BibRefs"
33

  
34
// Maps every book-name spelling that the CQL query can match to a full book name.
// Slash-wrapped values ("/chap./", "/Epist./", "/Vers./") are kept for matched forms that
// are not book names. This table is not read anywhere in the visible part of this script.
//
// Changes from the previous table:
//  - "Jo." was listed twice ("John", then "Job"). In a map literal the later entry wins,
//    so the effective value was already "Job"; only that entry is kept.
//    NOTE(review): "Jo." is ambiguous (John / Job) - confirm against the corpus.
//  - "Ezektel" and "Iob." are matched by the query but had no entry; they are added here.
//    NOTE(review): "Ezektel" is assumed to be a corpus spelling of Ezekiel - confirm.
bibBooks = [
	"Ac.":"Acts",
	"Act":"Acts",
	"Act.":"Acts",
	"Acts":"Acts",
	"Apoc.":"Revelation",
	"Apocalypse":"Revelation",
	"Baruch":"Baruch",
	"chap.":"/chap./",
	"Chro.":"Chronicles",
	"Chron.":"Chronicles",
	"Col.":"Colossians",
	"Coll.":"Colossians",
	"Colos.":"Colossians",
	"Cor.":"Corinthians",
	"Dan.":"Daniel",
	"Deut.":"Deuteronomy",
	"Deuteronomy":"Deuteronomy",
	"Eccles.":"Ecclesiastes",
	"Ecclesiastes":"Ecclesiastes",
	"Ecclus.":"Ecclesiasticus",
	"Eph.":"Ephesians",
	"Ephes.":"Ephesians",
	"Epist.":"/Epist./",
	"Esay":"Isaiah",
	"Esdras":"Esdras",
	"Exo.":"Exodus",
	"Exod":"Exodus",
	"Exod.":"Exodus",
	"Ezek.":"Ezekiel",
	"Ezektel":"Ezekiel",
	"Gal.":"Galatians",
	"Gen":"Genesis",
	"Gen.":"Genesis",
	"Haggai":"Haggai",
	"Heb.":"Hebrews",
	"Iames":"James",
	"Ier.":"Jeremiah",
	"Ierem.":"Jeremiah",
	"Iob.":"Job",
	"Ioel":"Joel",
	"Ioh.":"John",
	"Iohn":"John",
	"Iosh.":"Joshua",
	"Ioshua":"Joshua",
	"Isa":"Isaiah",
	"Isa.":"Isaiah",
	"Isai":"Isaiah",
	"Isaiah":"Isaiah",
	"Isay":"Isaiah",
	"Iud.":"Jude",
	"Iude":"Jude",
	"Iudg.":"Judges",
	"Jer.":"Jeremiah",
	"Jerem.":"Jeremiah",
	"Jo.":"Job",
	"Job":"Job",
	"Joel":"Joel",
	"Joh.":"John",
	"John":"John",
	"Josh.":"Joshua",
	"Joshua":"Joshua",
	"Jud.":"Jude",
	"Judg.":"Judges",
	"Judges":"Judges",
	"Kings":"Kings",
	"Levit.":"Leviticus",
	"Luk.":"Luke",
	"Luke":"Luke",
	"Mar.":"Mark",
	"Mark":"Mark",
	"Marke":"Mark",
	"Mat.":"Matthew",
	"Math":"Matthew",
	"Math.":"Matthew",
	"Matt.":"Matthew",
	"Matth.":"Matthew",
	"Matthew":"Matthew",
	"Micah":"Micah",
	"Num.":"Numbers",
	"Numb":"Numbers",
	"Numb.":"Numbers",
	"Numbers":"Numbers",
	"Pet.":"Peter",
	"Prov.":"Proverbs",
	"Proverbs":"Proverbs",
	"Psa.":"Psalms",
	"Psal.":"Psalms",
	"Psalm":"Psalms",
	"Rev.":"Revelation",
	"Rom":"Romans",
	"Rom.":"Romans",
	"Ruth":"Ruth",
	"Sam.":"Samuel",
	"Thess.":"Thessalonians",
	"Tim.":"Timothy",
	"Timothy":"Timothy",
	"Tit.":"Titus",
	"Titus":"Titus",
	"Vers.":"/Vers./",
	"Zach":"Zechariah",
	"Zach.":"Zechariah",
]
136

  
137
// Maps every book-name spelling that the CQL query can match to the abbreviation used to
// build the "book" property of the created units (looked up by ScanChapsVers).
// Slash-wrapped values ("/chap./", "/Epist./", "/Vers./") are kept for matched forms that
// are not book names.
//
// Changes from the previous table:
//  - "Jo." was listed twice ("Kgs", then "Job"). In a map literal the later entry wins,
//    so the effective value was already "Job"; the dead "Kgs" entry is removed.
//    NOTE(review): "Jo." is ambiguous (John / Job) - confirm against the corpus.
//  - "Ezektel" and "Iob." are matched by the query but had no entry, so they were reported
//    as unknown books; they are added here.
//    NOTE(review): "Ezektel" is assumed to be a corpus spelling of Ezekiel - confirm.
bibAbbr = [
	"Ac.":"Acts",
	"Act":"Acts",
	"Act.":"Acts",
	"Acts":"Acts",
	"Apoc.":"Rev.",
	"Apocalypse":"Rev.",
	"Baruch":"Bar.",
	"chap.":"/chap./",
	"Chro.":"Chr.",
	"Chron.":"Chr.",
	"Col.":"Col.",
	"Coll.":"Col.",
	"Colos.":"Col.",
	"Cor.":"Cor.",
	"Dan.":"Dan.",
	"Deut.":"Deut.",
	"Deuteronomy":"Deut.",
	"Eccles.":"Eccles.",
	"Ecclesiastes":"Eccles.",
	"Ecclus.":"Ecclus.",
	"Eph.":"Eph.",
	"Ephes.":"Eph.",
	"Epist.":"/Epist./",
	"Esay":"Isa.",
	"Esdras":"Esdras",
	"Exo.":"Exod.",
	"Exod":"Exod.",
	"Exod.":"Exod.",
	"Ezek.":"Ezek.",
	"Ezektel":"Ezek.",
	"Gal.":"Gal.",
	"Gen":"Gen.",
	"Gen.":"Gen.",
	"Haggai":"Hag.",
	"Heb.":"Heb.",
	"Iames":"Jas.",
	"Ier.":"Jer.",
	"Ierem.":"Jer.",
	"Iob.":"Job",
	"Ioel":"Joel",
	"Ioh.":"John",
	"Iohn":"John",
	"Iosh.":"Josh.",
	"Ioshua":"Josh.",
	"Isa":"Isa.",
	"Isa.":"Isa.",
	"Isai":"Isa.",
	"Isaiah":"Isa.",
	"Isay":"Isa.",
	"Iud.":"Jude",
	"Iude":"Jude",
	"Iudg.":"Judg.",
	"Jer.":"Jer.",
	"Jerem.":"Jer.",
	"Jo.":"Job",
	"Job":"Job",
	"Joel":"Joel",
	"Joh.":"John",
	"John":"John",
	"Josh.":"Josh.",
	"Joshua":"Josh.",
	"Jud.":"Jude",
	"Judg.":"Judg.",
	"Judges":"Judg.",
	"Kings":"Kgs",
	"Levit.":"Lev.",
	"Luk.":"Luke",
	"Luke":"Luke",
	"Mar.":"Mark",
	"Mark":"Mark",
	"Marke":"Mark",
	"Mat.":"Matt.",
	"Math":"Matt.",
	"Math.":"Matt.",
	"Matt.":"Matt.",
	"Matth.":"Matt.",
	"Matthew":"Matt.",
	"Micah":"Mic.",
	"Num.":"Num.",
	"Numb":"Num.",
	"Numb.":"Num.",
	"Numbers":"Num.",
	"Pet.":"Pet.",
	"Prov.":"Prov.",
	"Proverbs":"Prov.",
	"Psa.":"Ps.",
	"Psal.":"Ps.",
	"Psalm":"Ps.",
	"Rev.":"Rev.",
	"Rom":"Rom.",
	"Rom.":"Rom.",
	"Ruth":"Ruth",
	"Sam.":"Sam.",
	"Thess.":"Thess.",
	"Tim.":"Tim.",
	"Timothy":"Tim.",
	"Tit.":"Titus",
	"Titus":"Titus",
	"Vers.":"/Vers./",
	"Zach":"Zech.",
	"Zach.":"Zech.",
]
239

  
240
/*
241
println "bibAbbr lengths :"
242
println "len\tF"
243
bibAbbr.collect {  it.value }.sort().unique().collect {  it.length() }.sort().countBy { it }.each { println sprintf("%1d\t%2d", it.key, it.value) }
244
(5..8).each { max ->
245
	abbrs = bibAbbr.collect {  it.value }.sort().unique().findAll { it.length() == max }
246
	println max+" : "+abbrs
247
}
248
*/
249

  
250
// check for a corpus selection
utils = new CQPUtils()
corpusEngine = CQPSearchEngine.getCqiClient()

corpora = utils.getCorpora(this)
scriptName = this.class.getSimpleName()

// Exactly one corpus is required. An empty selection is rejected as well as a null or a
// multiple one: previously an empty selection passed this test and the script then failed
// on corpora[0] below.
if ((corpora == null) || corpora.size() != 1) {
	println "** $scriptName: please select a corpus in the Corpus view or provide a corpus name. Aborting."
	return false
}

// Annotations are attached to the main corpus of the selection
corpus = corpora[0].getMainCorpus()
corpusName = corpus.getName()
wordProperty = corpus.getProperty("word")

// The URS annotation structure must be available before any unit can be created
if (!URSCorpora.isAnnotationStructureReady(corpus)) {
	println "** URS Annotation Structure of "+corpusName+" is not ready. Aborting."
	return
}
270

  
271
// check for corpus annotation structure unit types
def analecCorpus = URSCorpora.getCorpus(corpus)

// Composite Biblical Reference : number, book, chapters_verses_list, chapters_verses_form
def crType = "Composite Biblical Reference"
if (!analecCorpus.getStructure().getUnites().contains(crType)) {
	analecCorpus.getStructure().ajouterType(Unite.class, crType)
	// properties are declared in this exact order
	[
		"reference_form",
		"reference_id",
		"number",
		"book",
		"chapters_verses_form",
		"books_chapters_verses_list"
	].each { propName ->
		analecCorpus.getStructure().ajouterProp(Unite.class, crType, propName)
	}
}

// Biblical Reference : number, book, chapter, verse
def rType = "Biblical Reference"
if (!analecCorpus.getStructure().getUnites().contains(rType)) {
	analecCorpus.getStructure().ajouterType(Unite.class, rType)
	// properties are declared in this exact order
	[
		"number",
		"book",
		"chapter",
		"verse",
		"raw_verses",
		"cr_id",
		"type"
	].each { propName ->
		analecCorpus.getStructure().ajouterProp(Unite.class, rType, propName)
	}
}

// check for corpus annotation structure input form
def vue = URSCorpora.getVue(corpus)
if (!vue.getTypesAVoir(Unite.class).contains(crType)) {
	vue.ajouterType(Unite.class, crType)
	// the view lists the properties in the reverse of the declaration order above
	[
		"reference_id",
		"reference_form",
		"books_chapters_verses_list",
		"chapters_verses_form",
		"book",
		"number"
	].each { propName ->
		vue.ajouterProp(Unite.class, crType, propName)
	}
}

if (!vue.getTypesAVoir(Unite.class).contains(rType)) {
	vue.ajouterType(Unite.class, rType)
	[
		"type",
		"cr_id",
		"raw_verses",
		"verse",
		"chapter",
		"book",
		"number"
	].each { propName ->
		vue.ajouterProp(Unite.class, rType, propName)
	}
}
321

  
322
// manage cqp matching strategy
// The strategy currently in force is tracked through the "cqp_matching_strategy" system
// property; its value on entry is kept in cqp_matching_strategy so that it can be put back
// at the end of the script.
cqp_matching_strategy = System.getProperty("cqp_matching_strategy")

if (cqp_matching_strategy != 'longest') {
	println "Changing MatchingStrategy from '${cqp_matching_strategy}' to 'longest'"
	corpusEngine.query("set MatchingStrategy longest;")
	System.setProperty("cqp_matching_strategy", 'longest')
}
330

  
331
// Run the biblical references query; its result is stored in the subcorpus named by bibRefs
corpusEngine.cqpQuery(corpusName, bibRefs, query)
nmatches = corpusEngine.subCorpusSize("$corpusName:$bibRefs")

// Nothing matched: stop here, before any existing unit is removed
if (nmatches == 0) {
	println "** No references found. Aborting."
	return
}

println "Found "+nmatches+" biblical references."

// Existing units of this type are removed before new ones are created.
// The units are copied to an array first, then removed one by one.
println "Removing all <"+rType+"> units..."
for (staleUnit in analecCorpus.getUnites(rType).toArray(new Unite[0])) {
	analecCorpus.supUnite(staleUnit)
}

// Number of units created by this run
nRUnits = 0
345

  
346
/**
 * Turns the token forms that follow a book name into a list of chapter/verse references.
 *
 * The scan is a heuristic over the numbers met from left to right:
 *  - the first number is a chapter;
 *  - a token starting with "v" (v., ver., vers. ...) means every following number is a
 *    verse of the current chapter;
 *  - otherwise a number greater than the last verse read is a new verse of the current
 *    chapter, and a number that is not greater starts a new chapter.
 *
 * Tokens that are neither an integer nor a "v..." marker (punctuation such as ";", forms
 * like "12a") are skipped. Previously they were converted with "as Integer" and aborted
 * the whole macro with a NumberFormatException.
 *
 * @param book       book form as found in the corpus; looked up in the bibAbbr table
 * @param num        book number ("" when there is none); prefixed to the book abbreviation
 * @param chapsVers  token forms following the book name
 * @return a list of [bookName, chapter, verse] triples; when the last chapter read has no
 *         verse, a triple with verse 0 is appended for it
 */
def ScanChapsVers (book, num, chapsVers) {

	def lookingForChap = true
	def restIsVerse = false
	def CVList = []
	def maxVerse = 0
	def currentChap = 0
	def bookName

	if (num.length() > 0) {
		num += " "
	}

	if (bibAbbr.containsKey(book)) {
		bookName = num+bibAbbr[book]
	} else {
		// the book is still reported, wrapped in question marks
		println "** unkown book name: "+book
		bookName = num+"?"+book+"?"
	}

	/*
	Intended readings (form -> chapter.verse list -> pattern). The heuristic below does not
	reproduce every one of them exactly, e.g. a chapter cited without a verse in the middle
	of a list, or a trailing isolated number.

	15. vers. 22 , 23 , 24.			15.22,15.23,15.24		chap-VERS-verses
	8. vers. 9.				8.9				chap-VERS-verse
	4. v. 14				4.14				chap-V-verse
	14. ver. 34.				14.34				chap-VER-verse
	3. ver. 11 , 12 ,			3.11,3.12			chap-VER-verses
	9. 9.					9.9				chap-verse
	2. 13. 14 .				2.13,2.14			chap-verses
	4. 41. 5. 26.				4.41,5.26			chap-verse-chap-verse
	9. 13. 9. 21. 10. 12. 12. 19.		9.13,9.21,10.12,12.19		chaps-verses
	9. 13. 9. 21. 10. 12. 12. 19. 2		9.13,9.21,10.12,12.19		chaps-verses-num
	3. 11 8.				3.11,8				chap-verse-chap
	5. 3 , 4 , & 5.				5.3,5.4,5.5			chap-verses-&-verse
	5. 5. 7. 13 , 15. 27. 6. & 30. 25.	5.5,5.7,5.13,5.15,5.27,6,30.25	chap-verses-&-chap-verse
	15.					15				chap
	*/

	chapsVers.each { form ->

		// separators carry no information
		if (form == "," || form == "&" || form == "." || form == "[") {
			return
		}

		// remove trailing '.'
		def token = (form ==~ /.*\./) ? form.substring(0, form.length()-1) : form

		// only plain integers are converted; at most 9 digits so the value fits an Integer
		def isNumber = (token ==~ /[0-9]{1,9}/)

		if (!isNumber) {
			// a verse marker only makes sense once a chapter has been read;
			// any other non-numeric token is ignored
			if (!restIsVerse && !lookingForChap && token ==~ /v.*/) {
				restIsVerse = true
			}
			return
		}

		def value = token as Integer

		if (restIsVerse) {
			maxVerse = value
			CVList << [bookName, currentChap, value]
		} else if (lookingForChap) {
			currentChap = value
			lookingForChap = false
		} else if (value > maxVerse) {
			maxVerse = value
			CVList << [bookName, currentChap, value]
		} else {
			// not greater than the last verse: read as a new chapter
			currentChap = value
			maxVerse = 0
		}
	}

	// the last chapter read has no verse: reference the chapter itself (verse 0)
	if (maxVerse == 0) {
		CVList << [bookName, currentChap, 0]
	}

	return CVList

}
418

  
419
// Positions of every match of the query: first token, target token (the book name, marked
// with @ in the query) and last token
def starts = corpusEngine.dumpSubCorpus("$corpusName:$bibRefs", ICqiClient.CQI_CONST_FIELD_MATCH,    0, nmatches-1)
def targets = corpusEngine.dumpSubCorpus("$corpusName:$bibRefs", ICqiClient.CQI_CONST_FIELD_TARGET, 0, nmatches-1)
def ends   = corpusEngine.dumpSubCorpus("$corpusName:$bibRefs", ICqiClient.CQI_CONST_FIELD_MATCHEND, 0, nmatches-1)

// One iteration per match. The two former branches (with / without a leading book number)
// differed only in how the number and the book position were obtained, so they are merged.
// The unused full-reference lookup and the unused iUnit counter are gone, the
// chapters/verses forms are fetched once instead of twice, and the formatted raw verses
// are computed once per match instead of once per unit.
[starts, targets, ends].transpose().each { match ->

	def (matchStart, matchTarget, matchEnd) = match

	// tokens before the target, when there are any, are the book number
	def hasNumber = matchStart < matchTarget
	def bookPos = hasNumber ? matchTarget : matchStart

	def number = ""
	if (hasNumber) {
		number = corpusEngine.cpos2Str("$corpusName.$wordProperty", (matchStart..(matchTarget-1)) as int[]).join(' ')
		// remove trailing '.'
		number = (number ==~ /.*\./) ? number.substring(0, number.length()-1) : number
	}

	def book = corpusEngine.cpos2Str("$corpusName.$wordProperty", (bookPos..bookPos) as int[]).join(' ')

	// everything after the book name: chapters and verses
	def chapsVersFormsList = corpusEngine.cpos2Str("$corpusName.$wordProperty", ((bookPos+1)..matchEnd) as int[]) as String[]
	def chapsVersForms = chapsVersFormsList.join(' ')
	def rawVerses = LangFormater.format(chapsVersForms, corpus.getLang())

	def chapsVersList = ScanChapsVers(book, number, chapsVersFormsList)

	// one unit per chapter/verse reference, all spanning the whole match
	chapsVersList.reverse().each { b, c, v ->

		def properties = [:]
		properties["number"] = number
		properties["book"] = b
		properties["chapter"] = c as String
		properties["verse"] = v as String
		properties["raw_verses"] = rawVerses
		properties["cr_id"] = 0 as String
		properties["type"] = "word"

		analecCorpus.addUniteSaisie(rType, matchStart, matchEnd, properties)
		nRUnits++
	}
}
483
//	}.countBy { it }.sort { a,b -> -a.value <=> -b.value ?: a.key <=> b.key }.each {
484
//	}.flatten().countBy { it }.sort { a,b -> -a.value <=> -b.value ?: a.key <=> b.key }.each {
485
// 			println it.key+"\t"+it.value
486
//		}
487

  
488
// Put the matching strategy back to what it was before this script changed it.
if (cqp_matching_strategy != 'longest') {
	if (cqp_matching_strategy == null) {
		// The property was not set when the script started. Sending "set MatchingStrategy null;"
		// is not a valid setting and System.setProperty rejects a null value with a
		// NullPointerException, which stopped the script before the units were saved.
		// Go back to 'standard' instead and clear the property.
		// NOTE(review): 'standard' is CQP's documented default strategy - confirm for this engine.
		println "Changing MatchingStrategy back to 'standard'"
		corpusEngine.query("set MatchingStrategy standard;")
		System.clearProperty("cqp_matching_strategy")
	} else {
		println "Changing MatchingStrategy back to '"+cqp_matching_strategy+"'"
		corpusEngine.query("set MatchingStrategy "+cqp_matching_strategy+";")
		System.setProperty("cqp_matching_strategy", cqp_matching_strategy)
	}
}
493

  
494
// Report how many units were created
println "${nRUnits} <${rType}> units created."

// Save the annotation corpus
print "Saving units... "
URSCorpora.saveCorpus(analecCorpus)
println "Done."

// Flag the corpus as modified only when at least one unit was created
if (nRUnits > 0) {
	corpus.setIsModified(true)
}
501

  

Formats disponibles : Unified diff