Révision 2357

tmp/org.txm.libs.msoffice/src/org/txm/libs/msoffice/ReadExcel.java (revision 2357)
2 2

  
3 3
import java.io.BufferedOutputStream;
4 4
import java.io.File;
5
import java.io.FileNotFoundException;
6 5
import java.io.FileOutputStream;
7 6
import java.io.IOException;
8
import java.io.InputStream;
9
import java.io.OutputStreamWriter;
10 7
import java.text.SimpleDateFormat;
11 8
import java.util.ArrayList;
12 9
import java.util.Arrays;
13
import java.util.Collection;
14
import java.util.Collections;
15 10
import java.util.HashMap;
11
import java.util.HashSet;
12
import java.util.List;
16 13

  
17 14
import org.apache.poi.EncryptedDocumentException;
18 15
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
......
22 19
import org.apache.poi.ss.usermodel.Sheet;
23 20
import org.apache.poi.ss.usermodel.Workbook;
24 21
import org.apache.poi.ss.usermodel.WorkbookFactory;
25
import org.apache.poi.ss.util.SheetUtil;
22
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
26 23

  
27 24
public class ReadExcel {
28

  
25
	
29 26
	Workbook wb = null;
30 27
	Sheet ws = null;
31 28
	String[] header = null;
......
35 32
	private File tableFile;
36 33
	
37 34
	public ReadExcel(File tableFile, String sheetName) throws EncryptedDocumentException, InvalidFormatException, IOException {
38
		wb = WorkbookFactory.create(tableFile);
35
		
39 36
		this.tableFile = tableFile;
40 37
		
41
		if (sheetName == null || sheetName.length() == 0) {
42
			ws = wb.getSheetAt(0);
43
		} else {
44
			ws = wb.getSheet(sheetName);
45
			if (ws == null) {
38
		if (tableFile.exists() && tableFile.length() > 0) {
39
			wb = WorkbookFactory.create(tableFile);
40
			if (sheetName == null || sheetName.length() == 0) {
46 41
				ws = wb.getSheetAt(0);
42
			} else {
43
				ws = wb.getSheet(sheetName);
44
				if (ws == null) {
45
					ws = wb.getSheetAt(0);
46
				}
47 47
			}
48
			
49
			nRows = ws.getPhysicalNumberOfRows();
50
		} else {
51
			wb = new XSSFWorkbook();
52
			ws = wb.createSheet();
48 53
		}
49
		
50
		nRows = ws.getPhysicalNumberOfRows();
51 54
	}
52 55
	
53
	private void removeLines(HashMap<String, String> rules) {
56
	/**
57
	 * one rule (regex test) per column
58
	 * all lines are processed
59
	 * 
60
	 * @param rules String: column name to test -> regex to match
61
	 */
62
	private boolean removeLines(HashMap<String, String> rules) {
54 63
		Row headers = ws.getRow(0);
64
		if (headers == null) return false;
55 65
		int colMax = headers.getLastCellNum();
56 66
		
57 67
		ArrayList<Integer> columnIdxToTest = new ArrayList<Integer>();
......
67 77
			}
68 78
		}
69 79
		
80
		if (columnIdxToTest.size() == 0) {
81
			System.out.println("Error: no column found with name="+rules.keySet());
82
			return false;
83
		}
84
		
85
		ArrayList<Integer> removed = new ArrayList<Integer>();
70 86
		for (int rowIndex = 0 ; rowIndex < nRows ; rowIndex++) {
71 87
			Row row = ws.getRow(rowIndex);
72
			
88
			if (row == null) continue;
73 89
			for (int i = 0 ; i < columnIdxToTest.size() ; i++) {
74 90
				int colIndex = columnIdxToTest.get(i);
75 91
				
76 92
				Cell cell = row.getCell(colIndex);
77 93
				if (cell != null) {
78 94
					String value = cellToString(cell).trim();
79
					if (columnsTest.get(i).equals(value)) {
80
						cell.setCellValue(rules.get("TODOOOOOOOOOOOOOOOOO"));
95
					if (value.matches(columnsTest.get(i))) {
96
						removed.add(0, rowIndex);
97
						ws.removeRow(row);
98
						break; // next row
81 99
					}
82 100
				}
83 101
			}
84 102
		}
103
		
104
		// TODO very slow when there is a lot of lines
105
		for (int rowIndex : removed) {
106
			ws.shiftRows(rowIndex+1, nRows, -1);
107
			nRows--;
108
		}
109
		
110
		return true;
85 111
	}
86 112
	
87
	private void renameColumns(HashMap<String, String> rules) {
113
	/**
114
	 * extract lines and columns following rules.
115
	 * 
116
	 * don't forget to save the ReadeExcel object
117
	 * 
118
	 * @param excel2
119
	 * @param lineRules
120
	 * @param columnsSelection
121
	 * @return
122
	 * @throws EncryptedDocumentException
123
	 * @throws InvalidFormatException
124
	 * @throws IOException
125
	 */
126
	public boolean extractTo(ReadExcel excel2, HashMap<String, String> lineRules, List<String> columnsSelection) throws EncryptedDocumentException, InvalidFormatException, IOException {
127
		Sheet ws2 = excel2.ws;
128
		if (!extractTo(ws2, lineRules, columnsSelection)) {
129
			return false;
130
		}
131
		
132
		return true;
133
	}
134
	
135
	/**
136
	 * one rule (regex test) per column
137
	 * all lines are processed
138
	 * 
139
	 * @param lineRules String: column name to test -> regex to match
140
	 * @throws IOException 
141
	 * @throws InvalidFormatException 
142
	 * @throws EncryptedDocumentException 
143
	 */
144
	public boolean extractTo(Sheet ws2, HashMap<String, String> lineRules, List<String> columnsSelection) throws EncryptedDocumentException, InvalidFormatException, IOException {
88 145
		Row headers = ws.getRow(0);
146
		if (headers == null) return false;
147
		
89 148
		int colMax = headers.getLastCellNum();
149
		
150
		ArrayList<Integer> columnIdxToTest = new ArrayList<Integer>();
151
		ArrayList<Integer> columnIdxToWrite = new ArrayList<Integer>();
152
		ArrayList<String> columns = new ArrayList<String>();
153
		ArrayList<String> columnsTest = new ArrayList<String>();
90 154
		for (int colIndex = 0 ; colIndex < colMax ; colIndex++) {
91 155
			Cell cell = headers.getCell(colIndex);
92 156
			if (cell != null) {
93 157
				String value = cellToString(cell).trim();
94
				if (rules.containsKey(value)) {
95
					cell.setCellValue(rules.get(value));
158
				columns.add(value);
159
				if (lineRules.containsKey(value)) {
160
					columnIdxToTest.add(colIndex);
161
					columnsTest.add(lineRules.get(value));
96 162
				}
163
				
164
				if (columnsSelection.contains(value)) {
165
					columnIdxToWrite.add(colIndex);
166
				}
97 167
			}
98 168
		}
169
		
170
		if (columnIdxToTest.size() == 0) {
171
			System.out.println("Error: no column found with name="+lineRules.keySet());
172
			return false;
173
		}
174
		
175
		if (columnIdxToWrite.size() != columnsSelection.size()) {
176
			System.out.println("Error: not all columns found="+columnIdxToWrite+" of "+columnsSelection);
177
			ArrayList<String> notFound = new ArrayList<String>(columnsSelection);
178
			notFound.removeAll(columns);
179
			System.out.println("NOT FOUND: "+notFound);
180
			return false;
181
		}
182
		
183
		Row row2 = ws2.createRow(0);
184
		int nCell = 0;
185
		for (int iCol : columnIdxToWrite) {
186
			Cell cell2 = row2.createCell(nCell);
187
			cell2.setCellValue(cellToString(headers.getCell(iCol)).trim());
188
			nCell++;
189
		}
190
		
191
		int nRowWritten = 1;
192
		for (int rowIndex = 0 ; rowIndex < nRows ; rowIndex++) {
193
			Row row = ws.getRow(rowIndex);
194
			if (row == null) continue;
195
			for (int i = 0 ; i < columnIdxToTest.size() ; i++) {
196
				int colIndex = columnIdxToTest.get(i);
197
				
198
				Cell cell = row.getCell(colIndex);
199
				if (cell != null) {
200
					String value = cellToString(cell).trim();
201
					if (value.matches(columnsTest.get(i))) {
202
						
203
						// write new line
204
						
205
						row2 = ws2.createRow(nRowWritten);
206
						nCell = 0;
207
						for (int iCol : columnIdxToWrite) {
208
							Cell cell2 = row2.createCell(nCell);
209
							cell2.setCellValue(cellToString(row.getCell(iCol)));
210
							nCell++;
211
						}
212
						nRowWritten++;
213
						
214
						break; // next row
215
					}
216
				}
217
			}
218
		}
219
		
220
		return true;
99 221
	}
100 222
	
101
	public void keepColumns(Collection<String> columnsToKeep) {
223
	/**
224
	 * 
225
	 * @param rules String:old column name ->  String:new column name
226
	 */
227
	private boolean renameColumns(HashMap<String, String> rules) {
102 228
		Row headers = ws.getRow(0);
229
		if (headers == null) return false;
230
		
103 231
		int colMax = headers.getLastCellNum();
104
		ArrayList<Integer> columnsToDelete = new ArrayList<Integer>();
105 232
		for (int colIndex = 0 ; colIndex < colMax ; colIndex++) {
106 233
			Cell cell = headers.getCell(colIndex);
107 234
			if (cell != null) {
108 235
				String value = cellToString(cell).trim();
109
				if (columnsToKeep.contains(value)) {
110
					columnsToDelete.add(colIndex);
236
				if (rules.containsKey(value)) {
237
					String s = rules.get(value);
238
					cell.setCellValue(s);
111 239
				}
112 240
			}
113 241
		}
114
		
115
		Collections.sort(columnsToDelete);
116

  
117
		for (int i = columnsToDelete.size() -1 ; i >= 0 ; i--) {
118
			SheetUtility.deleteColumn(ws, i);
119
		}
242
		return true;
120 243
	}
121 244
	
122 245
	public boolean save() throws IOException {
......
136 259
			header = new String[0];
137 260
			return header != null;
138 261
		}
139

  
262
		
140 263
		Row firstRow = ws.getRow(0);
141 264
		int colMax = firstRow.getLastCellNum();
142

  
265
		
143 266
		header = new String[colMax];
144 267
		for (int it = 0 ; it < colMax; it++) {
145 268
			header[it] = firstRow.getCell(it).getStringCellValue();
......
199 322
	public void close() throws IOException {
200 323
		if (wb != null) wb.close();
201 324
	}
202

  
325
	
203 326
	/**
204 327
	 * 
205 328
	 * @param inputFile
......
207 330
	 * @return list of lines (line = list of cells) with values converted to String - the file is completely read
208 331
	 */
209 332
	public static ArrayList<ArrayList<String>> toTable(File inputFile, String sheetName) {
210

  
333
		
211 334
		ArrayList<ArrayList<String>> data = new ArrayList<ArrayList<String>>();
212

  
335
		
213 336
		if (!inputFile.canRead()) {
214 337
			System.out.println("** Excel2XML: '"+inputFile.getName()+"' file not readable. Aborting.");
215 338
			return data;
216 339
		}
217

  
340
		
218 341
		try {
219 342
			Workbook wb = WorkbookFactory.create(inputFile);
220 343
			Sheet ws;
......
226 349
					ws = wb.getSheetAt(0);
227 350
				}
228 351
			}
229

  
352
			
230 353
			if (ws == null) {
231 354
				System.out.println("** Excel2XML: no sheet found. Aborting.");
232 355
				return data;
233 356
			}
234

  
357
			
235 358
			int nRows = ws.getPhysicalNumberOfRows();
236 359
			if (nRows == 0) return null;
237

  
360
			
238 361
			Row firstRow = ws.getRow(0);
239 362
			int colMax = firstRow.getLastCellNum();
240

  
363
			
241 364
			ArrayList<String> headers = new ArrayList<String>();
242 365
			for (int it = 0 ; it < colMax; it++) {
243 366
				headers.add(firstRow.getCell(it).getStringCellValue());
244 367
			}
245

  
368
			
246 369
			for (int rowIndex = 0 ; rowIndex < nRows ; rowIndex++) {
247 370
				Row row = ws.getRow(rowIndex);
248 371
				ArrayList<String> dataLine = new ArrayList<String>();
......
263 386
			e.printStackTrace();
264 387
			return null;
265 388
		}
266

  
389
		
267 390
		return data;
268 391
	}
269

  
392
	
270 393
	/**
271 394
	 * 
272
	 * @param cell
395
	 * @param cell. If null returns ""
273 396
	 * @return always a String starting with '#' if an error occurs 
274 397
	 */
275 398
	public static String cellToString(Cell cell) {
399
		if (cell == null) return "";
400
		
276 401
		switch (cell.getCellTypeEnum()) {
277
		case FORMULA:
278
			return "#formulla";
279
		case NUMERIC:
280
			if (DateUtil.isCellDateFormatted(cell)) {
281
                SimpleDateFormat dateFormat = new SimpleDateFormat("dd/MM/yyyy");
282
                return dateFormat.format(cell.getDateCellValue());
283
            } else {
284
                Double value = cell.getNumericCellValue();
285
                return value.toString();
286
            }
287
		case STRING:
288
			return cell.getStringCellValue();
289
		case BLANK:
290
			return "";
291
		case BOOLEAN:
292
			return ""+cell.getBooleanCellValue();	
293
		case ERROR:
294
			return "#error";
295
		default:
296
			return "#typeerror";
402
			case FORMULA:
403
				return "#formulla";
404
			case NUMERIC:
405
				if (DateUtil.isCellDateFormatted(cell)) {
406
					SimpleDateFormat dateFormat = new SimpleDateFormat("dd/MM/yyyy");
407
					return dateFormat.format(cell.getDateCellValue());
408
				} else {
409
					Double value = cell.getNumericCellValue();
410
					return value.toString();
411
				}
412
			case STRING:
413
				return cell.getStringCellValue();
414
			case BLANK:
415
				return "";
416
			case BOOLEAN:
417
				return ""+cell.getBooleanCellValue();	
418
			case ERROR:
419
				return "#error";
420
			default:
421
				return "#typeerror";
297 422
		}
298 423
	}
299

  
424
	
300 425
	public static void main(String[] args) throws Exception {
301
//		ArrayList<ArrayList<String>> data = toTable(new File("/home/mdecorde/xml/ruscorpora1m-test/metadata.xlsx"), null);
302
//		if (data.size() == 0) {
303
//			System.out.println("no data.");
304
//		} else {
305
//			System.out.println(data);
306
//		}
426
		//		ArrayList<ArrayList<String>> data = toTable(new File("/home/mdecorde/xml/ruscorpora1m-test/metadata.xlsx"), null);
427
		//		if (data.size() == 0) {
428
		//			System.out.println("no data.");
429
		//		} else {
430
		//			System.out.println(data);
431
		//		}
307 432
		
308
		File tableFile = new File("/home/mdecorde/TEMP/ANTRACT/AF/mini trs fixed/metadata.xlsx");
309
		File table2File = new File("/home/mdecorde/TEMP/ANTRACT/AF/mini trs fixed/metadata2.xlsx");
433
		File tableFile = new File("/home/mdecorde/TEMP/ANTRACT/AF/all.xlsx");
434
		File table2File = null;
310 435
		
311 436
		System.out.println("open...");
312 437
		ReadExcel excel = new ReadExcel(tableFile, null);
313
		System.out.println("keeping...");
314
		excel.keepColumns(Arrays.asList("Date de diffusion", "Durée", "Identifiant de la notice", "Nom fichier segmenté (info)", "Notes du titre", "Titre propre"));
315
		System.out.println("renaming...");
438
		
439
		HashMap<String, String> lineRules = new HashMap<String, String>(); // line tests to select line to keep
440
		List<String> columnsSelection; // lsit of columns to keep
441
		
442

  
443
//		//emissions
444
//		table2File = new File("/home/mdecorde/TEMP/ANTRACT/AF/emissions.xlsx");
445
//		columnsSelection = Arrays.asList(
446
//				"Identifiant de la notice", "Titre propre", "Notes du titre", "Date de diffusion", "Durée", "Nom fichier segmenté (info)", "antract_video",
447
//				"antract_debut","antract_fin","antract_duree","antract_tc_type","antract_tc_date");
448
//		lineRules.put("Type de notice", "Notice sommaire");
449
		
450
		//sujets
451
		table2File = new File("/home/mdecorde/TEMP/ANTRACT/AF/sujets.xlsx");
452
		columnsSelection = Arrays.asList(
453
				"Identifiant de la notice", "Titre propre", "Notes du titre", "Lien notice principale",
454
				"Date de diffusion", "Type de date", "Durée", "Genre", "Langue VO / VE", "Nature de production", "Producteurs (Aff.)", "Thématique",
455
				"Nom fichier segmenté (info)", "antract_video", "antract_debut","antract_fin","antract_duree","antract_tc_type","antract_tc_date",
456
				"Résumé", "Séquences", "Descripteurs (Aff. Lig.)", "Générique (Aff. Lig.)"
457
				);
458
		lineRules.put("Type de notice", "Notice sujet");
459
		
460
		table2File.delete();
461
		ReadExcel excel2 = new ReadExcel(table2File, null);
462
		if (!excel.extractTo(excel2, lineRules, columnsSelection)) {
463
			System.out.println("FAIL");
464
			return;
465
		}
466
		
467
		System.out.println("renaming column...");
316 468
		HashMap<String, String> rules = new HashMap<String, String>();
317
		rules.put("Date de diffusion", "date_diffusion");
318
		rules.put("Durée", "duree");
319
		rules.put("Identifiant de la notice", "");
320
		rules.put("Nom fichier segmenté (info)", "fichier_segmente");
321
		rules.put("Notes du titre", "subtitle");
322
		rules.put("Titre propre", "title");
323
		excel.renameColumns(rules);
324
		System.out.println("saving...");
325
		excel.saveAs(table2File);
469
		rules.put("Identifiant de la notice", "id");
470
		excel2.renameColumns(rules);
471
		
472
		System.out.println("saving&closing...");
473
		excel2.save();
474
		excel2.close();
475
		excel.close();
326 476
		System.out.println("done.");
327 477
	}
328 478
}

Formats disponibles : Unified diff