Revision 2376 tmp/org.txm.groovy.core/src/groovy/org/txm/macro/misc/RenameFilesMacro.groovy

RenameFilesMacro.groovy (revision 2376)
1
package org.txm.macro.misc
2 1
// STANDARD DECLARATIONS
2
package org.txm.macro.misc
3 3

  
4
import groovy.xml.QName
5

  
6
import java.nio.charset.Charset
7
import java.text.DecimalFormat
8
import org.txm.utils.xml.DomUtils;
9
import org.txm.importer.ValidateXml;
10
import groovy.util.XmlParser
11 4
import org.kohsuke.args4j.*
12 5
import groovy.transform.Field
13 6
import org.txm.rcp.swt.widget.parameters.*
14 7

  
15 8
// BEGINNING OF PARAMETERS
16
@Field @Option(name="rootDir", usage="The directory contains the 'orig' directory which contains the html files", widget="Folder", required=false, def="/path")
17
File rootDir = new File("");
18 9

  
19
@Field @Option(name="encoding", usage="HTML encoding", widget="String", required=false, def="iso-8859-1")
20
String encoding = "iso-8859-1" // HTML files encoding
10
@Field @Option(name="inputDirectory",usage="TXT directory", widget="Folder", required=false, def="txt")
11
File inputDirectory;
21 12

  
22
@Field @Option(name="debug", usage="show debug messages. Values = true, false", widget="String", required=false, def="false")
23
def debug = "true" // set true to debug the script
13
// **change this parameter**
14
@Field @Option(name="extension",usage="Regexp de l'extension des fichiers à modifier", widget="String", required=true, def='\\.txt')
15
def extension = "\\.trs"
24 16

  
17
// **change this parameter**
18
@Field @Option(name="find",usage="Expression régulière", widget="String", required=true, def='’')
19
def find = "Bobine"
20

  
21
// **change this parameter**
22
@Field @Option(name="replaceWith",usage="Chaîne de remplacement", widget="String", required=false, def='\'')
23
def replaceWith = ""
24

  
25 25
// Open the parameters input dialog box
26 26
if (!ParametersDialog.open(this)) return;
27 27

  
28
debug = ("true" == debug)
29 28
// END OF PARAMETERS
30 29

  
31
String corpusName = rootDir.getName()
32
File srcDir = new File(rootDir, "orig");
33
File outDir = new File(rootDir, "xhtml");
34
File outDir2 = new File(rootDir, corpusName);
35
File rejected = new File(rootDir, "duplicates");
36
File tmpDir = new File(rootDir, "tmp");
37

  
38
if (!srcDir.exists()) {
39
	println "STOP, srcDir does not exists $srcDir"
40
	return;
41
}
42

  
43
outDir.deleteDir()
44
outDir.mkdir()
45
outDir2.deleteDir()
46
outDir2.mkdir()
47
rejected.deleteDir()
48
rejected.mkdir()
49
tmpDir.deleteDir()
50
tmpDir.mkdir()
51

  
52
def allTags = new HashSet<String>();
53
def allStyles = new HashSet<String>();
54
def allClasses = new HashSet<String>();
55
def newPrefix = "Numéro de document : "
56
int itext = 1;
57
def formater = new DecimalFormat("0000");
58
int LIMITDIFF = 10
59
def metadatas = ["DocPublicationName", "DocHeader"]
60
def files = []
61
srcDir.eachFileMatch(~/.*\.html/){ htmlFile -> files << htmlFile}
62
files = files.sort()
63

  
64
def done = new HashSet<String>();
65
def ignored = []
66
def allTitles = [:]
67
def dones = [:]
68
def ignoreds = []
69

  
70
/**
 * Recursively gathers the text held by a node.
 *
 * A String is taken as is; for anything else the text of every element
 * returned by children() is gathered recursively, in order. Pieces are joined
 * with spaces, line breaks are replaced by spaces, and the result is trimmed
 * and then prefixed with exactly one space.
 *
 * @param node a String, or an object exposing children()
 * @return a single space followed by the trimmed, single-line text
 */
def getText(def node) {
	def collected = new StringBuilder(" ")
	if (node instanceof String) {
		collected << " " << node
	} else {
		node.children().each { kid ->
			collected << " " << getText(kid)
		}
	}
	def flattened = collected.toString().replace("\n", " ")
	return " " + flattened.trim()
}
83

  
84
println "Nb of HTML files: "+files.size()
85
for (File htmlFile : files) {
86
	println "Processing file $htmlFile"
87
	File tmpHTML = new File(tmpDir, htmlFile.getName())
88
	tmpHTML.withWriter("UTF-8") { writer -> 
89
		String txt = htmlFile.getText(encoding)
90
		txt = txt.replaceAll("<p></p>", " ");
91
		txt = txt.replaceAll("<p> </p>", " ");
92
		txt = txt.replaceAll("<br>", "<br> ");
93
		writer.write(txt)
94
	}
95
	
96
	String name = htmlFile.getName()
97
	name = name.substring(0, name.lastIndexOf("."));
98

  
99
	File xhtmlFile = new File(outDir, name+".xhtml")
100

  
101
	xhtmlFile.withWriter("UTF-8") { out ->
102
		// Jsoup.connect() only prepares an http(s) request and returns a Connection, which
		// offers neither charset() nor outerHtml(). Parse the temporary file directly instead;
		// it was written in UTF-8 just above.
		def doc = org.jsoup.Jsoup.parse(tmpHTML, "UTF-8");
		println "current charset: "+doc.charset()
		doc.charset(Charset.forName("UTF-8"))
		println "current charset: "+doc.charset()
		// serialize the whole parsed document into the .xhtml file
		out.println(doc.outerHtml())
107
	}
108

  
109
	if (ValidateXml.test(xhtmlFile)) {
110
		def root = new XmlParser().parse(xhtmlFile)
111
		def tables = root.body.table.tbody.tr.td
112
		if (tables.size() == 0) tables = root.body.table.tr.td
113
		//println "Nb of txt : "+tables.size()
114

  
115
		for (def text : tables) {
116
			String sign = ""
117
		
118
			// '!' binds tighter than 'instanceof': the negation must wrap the whole test,
			// otherwise (!text) is a Boolean and the check is always false.
			if (!(text instanceof groovy.util.Node)) { println "NOT NODE: "+text; continue; }
119
			//println "TEXT "
120
			//text.setName("text")
121
			boolean endOfText = false;
122
			def textMetadatas = [:]
123
			for (String metadata : metadatas) {
124
				textMetadatas[metadata] = ""
125
			}
126
			for (def subtable : text.table) text.remove(subtable)	
127
			
128
			for (def p : text.table.p) p.addChild(" ")		
129
			
130
			for (def child : text.span) {
131
				if ("color:red; font-weight:bold".equals(child.@style)) {
132
					//text.remove(child)
133
					// debug trace of the highlighted span about to be replaced by a <w> element
					if (debug) println "Found bold: "+child
134
					child.replaceNode { node -> w(expFound: "y", child.text())}
135
				}
136
			}
137
			
138
			def startIgnoringText = false
139
			def tmp =""
140
			def ichar = 0
141
			String title = "";
142
			def ignoredText = ""
143
			def children = text.children()
144
			for (int idx = 0 ; idx < children.size() ; idx++) {
145
				
146
				def child  = children[idx]
147
				if (debug) println "child: $child"
148
			
149
				if (startIgnoringText) {
150
					if (debug) println "Ignoring text : "+ignoredText
151
					if (child instanceof String) ignoredText += child
152
					else ignoredText += child.text()
153
					
154
					def t = text.children().remove(idx);
155
					//if (tmp.length() > 0) println "removing : "+t
156
					idx--
157
					continue; // next child
158
				}
159
			
160
				if (child instanceof String) {
161
					//println " "+child
162
					ichar += child.length()
163
				} else {
164
					ichar += child.text().length()
165
					//		allTags.add(child.name().getLocalPart())
166
					//		allClasses.add(child.@class)
167
					//		allStyles.add(child.@style)
168
					def nn = child.name()
169
					
170
					try {nn = nn.getLocalPart()} catch(Exception e) {}
171
					switch (nn) {
172
						case "br": break;
173
						case "span":
174
						if (debug) println "Found span $child"
175
							String classV = child.@class
176
							String style = child.@style
177
							if (classV != null) {
178
								if (metadatas.contains(classV)) {
179
									textMetadatas[classV] = (textMetadatas[classV] + " " + child.text()).trim()
180
									text.remove(child);
181
									idx--
182
									//println "METADATA: "+classV + " = "+child.text().trim().length();
183
								} else if ("TitreArticleVisu" == classV) {
184
									title += getText(child)//.text().trim().replaceAll("\n", " ")+ " "
185
									child.name = new QName(child.name().getNamespaceURI(), "head", child.name().getPrefix())
186
								} else {
187
									println "UNKNOWED CLASS: "+classV + " = "+child.text().trim().length();
188
								}
189
							} else if (style != null) {
190
								if ("color:red; font-weight:bold".equals(style)) {
191
									//child.replaceNode { node -> w(expFound: "test")	}
192
									//println "KEYWORD: "+child.text().trim();
193
								} else {
194
									println "UNKNOWED STYLE: '"+style+"' = "+child.text().trim();
195
								}
196
							} else {
197
								println "UNKNOWED SPAN: "+child.text().trim();
198
							}
199
							break;
200

  
201
						case "a": break
202
						case "w": break;
203
						case "b":
204
							startIgnoringText = true;
205
							tmp = child.text()
206
							//if (tmp.length() > 0) println "START REMOVING FROM : "+tmp
207
							text.remove(child);
208
							idx--
209
							break;
210
						case "i": break;
211
						case "font":
							// <font> elements are rewritten according to their style attribute:
							// italic -> <i>, bold -> <b>; any other style is only reported.
							if (debug) println "Found font $child"
							String fontStyle = child.@style
							if ("font-style:italic;" == fontStyle) {
								if (debug) println "ITALIC: "+getText(child).trim();
								child.replaceNode { node -> i(getText(child))}
							} else if ("font-weight:bold;" == fontStyle) {
								// compare against the style: a bare string literal is always true,
								// which turned every non-italic font into <b>
								if (debug) println "BOLD: "+getText(child).trim();
								child.replaceNode { node -> b(getText(child))}
							} else {
								println "FSTYLE: '"+fontStyle+"' = "+getText(child).trim();
							}
							break;
224
						default: println child.name()
225
					}
226
				}
227
			}
228
			
229
			//rename td to text
230
			text.name = new QName(text.name().getNamespaceURI(), "text", text.name().getPrefix())
231
			
232
			//Write metadatas
233
			for( String metadata : metadatas) {
234
				text.attributes().put(metadata, textMetadatas[metadata])
235
				//sign+= " "+textMetadatas[metadata].trim()
236
			}
237
			
238
			// get document number
239
			ignoredText = ignoredText.replaceAll("\n", " ")
240
			int iNo = ignoredText.indexOf(newPrefix);
241
			//println ignoredText
242
			if (iNo >= 0) {
243
				String no = ignoredText.substring(iNo+newPrefix.length()).trim()
244
				text.attributes().put("idnews", no)
245
				//sign += " "+no
246
				text.attributes().put("date", no.substring(5,9)+"-"+no.substring(9,11)+"-"+no.substring(11,13))
247
				text.attributes().put("yyyymmdd", no.substring(5,13))
248
				text.attributes().put("yyyymm", no.substring(5,11))
249
				text.attributes().put("yyyy", no.substring(5,9))
250
				text.attributes().put("mm", no.substring(9,11))
251
				text.attributes().put("dd", no.substring(11,13))
252
			}
253
			
254
			//sign += " "+ichar
255
			sign += " "+title
256
			
257
			if (allTitles[title] == null) allTitles[title] = ichar
258
			if (Math.abs(allTitles[title] - ichar) > LIMITDIFF) {
259
				sign += " "+ichar
260
			}
261
			File xmlFile;
262
			if (done.contains(sign)) {
263
				ignored << sign
264
				xmlFile = new File(rejected, name+"_"+formater.format((itext++))+".xml")
265
				ignoreds << xmlFile.getName()
266
			} else {
267
				done << sign;
268
				xmlFile = new File(outDir2, name+"_"+formater.format((itext++))+".xml")
269
				dones[sign] = xmlFile
270
			}
271
			// Write the text element to its own UTF-8 XML file. The writer is closed in all
			// cases so that a file handle is not leaked for every text produced.
			def writer = new PrintWriter(new OutputStreamWriter(new FileOutputStream(xmlFile) , "UTF-8")); //$NON-NLS-1$
			try {
				writer.println "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
				new XmlNodePrinter(writer).print(text)
			} finally {
				writer.close()
			}
274
		}
275
	}
276
}
277

  
278
if (ignored.size() > 0) {
279
	File ignoredFile = new File (rejected, "ignored.txt");
280
	ignoredFile.withWriter("UTF-8") { writer ->
281
		writer.println "TOTAL: "+ignored.size()
282
		for (int i = 0 ; i < ignored.size() ; i++) {
283
			def sign = ignored[i]
284
			writer.println "\n**DUPLICATE\n "
285
			writer.println "keeped="+dones[sign];
286
			writer.println "rejected="+ignoreds[i];
287
			writer.println "SIGN="+sign
288
			writer.println "\n"
289
		}
290
	}
291
	println "TOTAL IGNORED: "+ignored.size()
292
}
293

  
294
println "TOTAL TEXT: $itext"
295
if (!debug) {
296
	outDir.deleteDir()
297
	tmpDir.deleteDir()
298
}
30
// Rename the files of inputDirectory whose name matches the 'extension' regexp:
// every match of the 'find' regexp in the file name is replaced by 'replaceWith'.
if (inputDirectory == null || !inputDirectory.isDirectory()) {
	// inputDirectory has no initial value, so it may still be unset here
	println "STOP, inputDirectory is not a directory: $inputDirectory"
	return;
}
println "In $inputDirectory..."
inputDirectory.eachFileMatch(~/.*$extension/) { file ->               // for each file matching extension
	println " renaming: "+file.getName()
	String name = file.getName()
	String newName = name.replaceAll(find, replaceWith)
	if (newName == name) return // nothing to replace; 'return' only leaves this closure call

	File target = new File(file.getParentFile(), newName)
	if (target.exists()) {
		// do not let renameTo silently replace another file
		println " WARNING: '"+newName+"' already exists, '"+name+"' was not renamed"
		return
	}
	// renameTo reports failure through its return value, not through an exception
	if (!file.renameTo(target)) {
		println " WARNING: could not rename '"+name+"' to '"+newName+"'"
	}
}

Also available in: Unified diff