Révision 2357
tmp/org.txm.libs.msoffice/src/org/txm/libs/msoffice/ReadExcel.java (revision 2357) | ||
---|---|---|
2 | 2 |
|
3 | 3 |
import java.io.BufferedOutputStream; |
4 | 4 |
import java.io.File; |
5 |
import java.io.FileNotFoundException; |
|
6 | 5 |
import java.io.FileOutputStream; |
7 | 6 |
import java.io.IOException; |
8 |
import java.io.InputStream; |
|
9 |
import java.io.OutputStreamWriter; |
|
10 | 7 |
import java.text.SimpleDateFormat; |
11 | 8 |
import java.util.ArrayList; |
12 | 9 |
import java.util.Arrays; |
13 |
import java.util.Collection; |
|
14 |
import java.util.Collections; |
|
15 | 10 |
import java.util.HashMap; |
11 |
import java.util.HashSet; |
|
12 |
import java.util.List; |
|
16 | 13 |
|
17 | 14 |
import org.apache.poi.EncryptedDocumentException; |
18 | 15 |
import org.apache.poi.openxml4j.exceptions.InvalidFormatException; |
... | ... | |
22 | 19 |
import org.apache.poi.ss.usermodel.Sheet; |
23 | 20 |
import org.apache.poi.ss.usermodel.Workbook; |
24 | 21 |
import org.apache.poi.ss.usermodel.WorkbookFactory; |
25 |
import org.apache.poi.ss.util.SheetUtil;
|
|
22 |
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
|
|
26 | 23 |
|
27 | 24 |
public class ReadExcel { |
28 |
|
|
25 |
|
|
29 | 26 |
Workbook wb = null; |
30 | 27 |
Sheet ws = null; |
31 | 28 |
String[] header = null; |
... | ... | |
35 | 32 |
private File tableFile; |
36 | 33 |
|
37 | 34 |
public ReadExcel(File tableFile, String sheetName) throws EncryptedDocumentException, InvalidFormatException, IOException { |
38 |
wb = WorkbookFactory.create(tableFile); |
|
35 |
|
|
39 | 36 |
this.tableFile = tableFile; |
40 | 37 |
|
41 |
if (sheetName == null || sheetName.length() == 0) { |
|
42 |
ws = wb.getSheetAt(0); |
|
43 |
} else { |
|
44 |
ws = wb.getSheet(sheetName); |
|
45 |
if (ws == null) { |
|
38 |
if (tableFile.exists() && tableFile.length() > 0) { |
|
39 |
wb = WorkbookFactory.create(tableFile); |
|
40 |
if (sheetName == null || sheetName.length() == 0) { |
|
46 | 41 |
ws = wb.getSheetAt(0); |
42 |
} else { |
|
43 |
ws = wb.getSheet(sheetName); |
|
44 |
if (ws == null) { |
|
45 |
ws = wb.getSheetAt(0); |
|
46 |
} |
|
47 | 47 |
} |
48 |
|
|
49 |
nRows = ws.getPhysicalNumberOfRows(); |
|
50 |
} else { |
|
51 |
wb = new XSSFWorkbook(); |
|
52 |
ws = wb.createSheet(); |
|
48 | 53 |
} |
49 |
|
|
50 |
nRows = ws.getPhysicalNumberOfRows(); |
|
51 | 54 |
} |
52 | 55 |
|
53 |
private void removeLines(HashMap<String, String> rules) { |
|
56 |
/** |
|
57 |
* one rule (regex test) per column |
|
58 |
* all lines are processed |
|
59 |
* |
|
60 |
* @param rules String: column name to test -> regex to match |
|
61 |
*/ |
|
62 |
private boolean removeLines(HashMap<String, String> rules) { |
|
54 | 63 |
Row headers = ws.getRow(0); |
64 |
if (headers == null) return false; |
|
55 | 65 |
int colMax = headers.getLastCellNum(); |
56 | 66 |
|
57 | 67 |
ArrayList<Integer> columnIdxToTest = new ArrayList<Integer>(); |
... | ... | |
67 | 77 |
} |
68 | 78 |
} |
69 | 79 |
|
80 |
if (columnIdxToTest.size() == 0) { |
|
81 |
System.out.println("Error: no column found with name="+rules.keySet()); |
|
82 |
return false; |
|
83 |
} |
|
84 |
|
|
85 |
ArrayList<Integer> removed = new ArrayList<Integer>(); |
|
70 | 86 |
for (int rowIndex = 0 ; rowIndex < nRows ; rowIndex++) { |
71 | 87 |
Row row = ws.getRow(rowIndex); |
72 |
|
|
88 |
if (row == null) continue; |
|
73 | 89 |
for (int i = 0 ; i < columnIdxToTest.size() ; i++) { |
74 | 90 |
int colIndex = columnIdxToTest.get(i); |
75 | 91 |
|
76 | 92 |
Cell cell = row.getCell(colIndex); |
77 | 93 |
if (cell != null) { |
78 | 94 |
String value = cellToString(cell).trim(); |
79 |
if (columnsTest.get(i).equals(value)) { |
|
80 |
cell.setCellValue(rules.get("TODOOOOOOOOOOOOOOOOO")); |
|
95 |
if (value.matches(columnsTest.get(i))) { |
|
96 |
removed.add(0, rowIndex); |
|
97 |
ws.removeRow(row); |
|
98 |
break; // next row |
|
81 | 99 |
} |
82 | 100 |
} |
83 | 101 |
} |
84 | 102 |
} |
103 |
|
|
104 |
// TODO very slow when there is a lot of lines |
|
105 |
for (int rowIndex : removed) { |
|
106 |
ws.shiftRows(rowIndex+1, nRows, -1); |
|
107 |
nRows--; |
|
108 |
} |
|
109 |
|
|
110 |
return true; |
|
85 | 111 |
} |
86 | 112 |
|
87 |
private void renameColumns(HashMap<String, String> rules) { |
|
113 |
/** |
|
114 |
* extract lines and columns following rules. |
|
115 |
* |
|
116 |
* don't forget to save the ReadeExcel object |
|
117 |
* |
|
118 |
* @param excel2 |
|
119 |
* @param lineRules |
|
120 |
* @param columnsSelection |
|
121 |
* @return |
|
122 |
* @throws EncryptedDocumentException |
|
123 |
* @throws InvalidFormatException |
|
124 |
* @throws IOException |
|
125 |
*/ |
|
126 |
public boolean extractTo(ReadExcel excel2, HashMap<String, String> lineRules, List<String> columnsSelection) throws EncryptedDocumentException, InvalidFormatException, IOException { |
|
127 |
Sheet ws2 = excel2.ws; |
|
128 |
if (!extractTo(ws2, lineRules, columnsSelection)) { |
|
129 |
return false; |
|
130 |
} |
|
131 |
|
|
132 |
return true; |
|
133 |
} |
|
134 |
|
|
135 |
/** |
|
136 |
* one rule (regex test) per column |
|
137 |
* all lines are processed |
|
138 |
* |
|
139 |
* @param lineRules String: column name to test -> regex to match |
|
140 |
* @throws IOException |
|
141 |
* @throws InvalidFormatException |
|
142 |
* @throws EncryptedDocumentException |
|
143 |
*/ |
|
144 |
public boolean extractTo(Sheet ws2, HashMap<String, String> lineRules, List<String> columnsSelection) throws EncryptedDocumentException, InvalidFormatException, IOException { |
|
88 | 145 |
Row headers = ws.getRow(0); |
146 |
if (headers == null) return false; |
|
147 |
|
|
89 | 148 |
int colMax = headers.getLastCellNum(); |
149 |
|
|
150 |
ArrayList<Integer> columnIdxToTest = new ArrayList<Integer>(); |
|
151 |
ArrayList<Integer> columnIdxToWrite = new ArrayList<Integer>(); |
|
152 |
ArrayList<String> columns = new ArrayList<String>(); |
|
153 |
ArrayList<String> columnsTest = new ArrayList<String>(); |
|
90 | 154 |
for (int colIndex = 0 ; colIndex < colMax ; colIndex++) { |
91 | 155 |
Cell cell = headers.getCell(colIndex); |
92 | 156 |
if (cell != null) { |
93 | 157 |
String value = cellToString(cell).trim(); |
94 |
if (rules.containsKey(value)) { |
|
95 |
cell.setCellValue(rules.get(value)); |
|
158 |
columns.add(value); |
|
159 |
if (lineRules.containsKey(value)) { |
|
160 |
columnIdxToTest.add(colIndex); |
|
161 |
columnsTest.add(lineRules.get(value)); |
|
96 | 162 |
} |
163 |
|
|
164 |
if (columnsSelection.contains(value)) { |
|
165 |
columnIdxToWrite.add(colIndex); |
|
166 |
} |
|
97 | 167 |
} |
98 | 168 |
} |
169 |
|
|
170 |
if (columnIdxToTest.size() == 0) { |
|
171 |
System.out.println("Error: no column found with name="+lineRules.keySet()); |
|
172 |
return false; |
|
173 |
} |
|
174 |
|
|
175 |
if (columnIdxToWrite.size() != columnsSelection.size()) { |
|
176 |
System.out.println("Error: not all columns found="+columnIdxToWrite+" of "+columnsSelection); |
|
177 |
ArrayList<String> notFound = new ArrayList<String>(columnsSelection); |
|
178 |
notFound.removeAll(columns); |
|
179 |
System.out.println("NOT FOUND: "+notFound); |
|
180 |
return false; |
|
181 |
} |
|
182 |
|
|
183 |
Row row2 = ws2.createRow(0); |
|
184 |
int nCell = 0; |
|
185 |
for (int iCol : columnIdxToWrite) { |
|
186 |
Cell cell2 = row2.createCell(nCell); |
|
187 |
cell2.setCellValue(cellToString(headers.getCell(iCol)).trim()); |
|
188 |
nCell++; |
|
189 |
} |
|
190 |
|
|
191 |
int nRowWritten = 1; |
|
192 |
for (int rowIndex = 0 ; rowIndex < nRows ; rowIndex++) { |
|
193 |
Row row = ws.getRow(rowIndex); |
|
194 |
if (row == null) continue; |
|
195 |
for (int i = 0 ; i < columnIdxToTest.size() ; i++) { |
|
196 |
int colIndex = columnIdxToTest.get(i); |
|
197 |
|
|
198 |
Cell cell = row.getCell(colIndex); |
|
199 |
if (cell != null) { |
|
200 |
String value = cellToString(cell).trim(); |
|
201 |
if (value.matches(columnsTest.get(i))) { |
|
202 |
|
|
203 |
// write new line |
|
204 |
|
|
205 |
row2 = ws2.createRow(nRowWritten); |
|
206 |
nCell = 0; |
|
207 |
for (int iCol : columnIdxToWrite) { |
|
208 |
Cell cell2 = row2.createCell(nCell); |
|
209 |
cell2.setCellValue(cellToString(row.getCell(iCol))); |
|
210 |
nCell++; |
|
211 |
} |
|
212 |
nRowWritten++; |
|
213 |
|
|
214 |
break; // next row |
|
215 |
} |
|
216 |
} |
|
217 |
} |
|
218 |
} |
|
219 |
|
|
220 |
return true; |
|
99 | 221 |
} |
100 | 222 |
|
101 |
public void keepColumns(Collection<String> columnsToKeep) { |
|
223 |
/** |
|
224 |
* |
|
225 |
* @param rules String:old column name -> String:new column name |
|
226 |
*/ |
|
227 |
private boolean renameColumns(HashMap<String, String> rules) { |
|
102 | 228 |
Row headers = ws.getRow(0); |
229 |
if (headers == null) return false; |
|
230 |
|
|
103 | 231 |
int colMax = headers.getLastCellNum(); |
104 |
ArrayList<Integer> columnsToDelete = new ArrayList<Integer>(); |
|
105 | 232 |
for (int colIndex = 0 ; colIndex < colMax ; colIndex++) { |
106 | 233 |
Cell cell = headers.getCell(colIndex); |
107 | 234 |
if (cell != null) { |
108 | 235 |
String value = cellToString(cell).trim(); |
109 |
if (columnsToKeep.contains(value)) { |
|
110 |
columnsToDelete.add(colIndex); |
|
236 |
if (rules.containsKey(value)) { |
|
237 |
String s = rules.get(value); |
|
238 |
cell.setCellValue(s); |
|
111 | 239 |
} |
112 | 240 |
} |
113 | 241 |
} |
114 |
|
|
115 |
Collections.sort(columnsToDelete); |
|
116 |
|
|
117 |
for (int i = columnsToDelete.size() -1 ; i >= 0 ; i--) { |
|
118 |
SheetUtility.deleteColumn(ws, i); |
|
119 |
} |
|
242 |
return true; |
|
120 | 243 |
} |
121 | 244 |
|
122 | 245 |
public boolean save() throws IOException { |
... | ... | |
136 | 259 |
header = new String[0]; |
137 | 260 |
return header != null; |
138 | 261 |
} |
139 |
|
|
262 |
|
|
140 | 263 |
Row firstRow = ws.getRow(0); |
141 | 264 |
int colMax = firstRow.getLastCellNum(); |
142 |
|
|
265 |
|
|
143 | 266 |
header = new String[colMax]; |
144 | 267 |
for (int it = 0 ; it < colMax; it++) { |
145 | 268 |
header[it] = firstRow.getCell(it).getStringCellValue(); |
... | ... | |
199 | 322 |
public void close() throws IOException { |
200 | 323 |
if (wb != null) wb.close(); |
201 | 324 |
} |
202 |
|
|
325 |
|
|
203 | 326 |
/** |
204 | 327 |
* |
205 | 328 |
* @param inputFile |
... | ... | |
207 | 330 |
* @return list of lines (line = list of cells) with values converted to String - the file is completely read |
208 | 331 |
*/ |
209 | 332 |
public static ArrayList<ArrayList<String>> toTable(File inputFile, String sheetName) { |
210 |
|
|
333 |
|
|
211 | 334 |
ArrayList<ArrayList<String>> data = new ArrayList<ArrayList<String>>(); |
212 |
|
|
335 |
|
|
213 | 336 |
if (!inputFile.canRead()) { |
214 | 337 |
System.out.println("** Excel2XML: '"+inputFile.getName()+"' file not readable. Aborting."); |
215 | 338 |
return data; |
216 | 339 |
} |
217 |
|
|
340 |
|
|
218 | 341 |
try { |
219 | 342 |
Workbook wb = WorkbookFactory.create(inputFile); |
220 | 343 |
Sheet ws; |
... | ... | |
226 | 349 |
ws = wb.getSheetAt(0); |
227 | 350 |
} |
228 | 351 |
} |
229 |
|
|
352 |
|
|
230 | 353 |
if (ws == null) { |
231 | 354 |
System.out.println("** Excel2XML: no sheet found. Aborting."); |
232 | 355 |
return data; |
233 | 356 |
} |
234 |
|
|
357 |
|
|
235 | 358 |
int nRows = ws.getPhysicalNumberOfRows(); |
236 | 359 |
if (nRows == 0) return null; |
237 |
|
|
360 |
|
|
238 | 361 |
Row firstRow = ws.getRow(0); |
239 | 362 |
int colMax = firstRow.getLastCellNum(); |
240 |
|
|
363 |
|
|
241 | 364 |
ArrayList<String> headers = new ArrayList<String>(); |
242 | 365 |
for (int it = 0 ; it < colMax; it++) { |
243 | 366 |
headers.add(firstRow.getCell(it).getStringCellValue()); |
244 | 367 |
} |
245 |
|
|
368 |
|
|
246 | 369 |
for (int rowIndex = 0 ; rowIndex < nRows ; rowIndex++) { |
247 | 370 |
Row row = ws.getRow(rowIndex); |
248 | 371 |
ArrayList<String> dataLine = new ArrayList<String>(); |
... | ... | |
263 | 386 |
e.printStackTrace(); |
264 | 387 |
return null; |
265 | 388 |
} |
266 |
|
|
389 |
|
|
267 | 390 |
return data; |
268 | 391 |
} |
269 |
|
|
392 |
|
|
270 | 393 |
/** |
271 | 394 |
* |
272 |
* @param cell |
|
395 |
* @param cell. If null returns ""
|
|
273 | 396 |
* @return always a String starting with '#' if an error occurs |
274 | 397 |
*/ |
275 | 398 |
public static String cellToString(Cell cell) { |
399 |
if (cell == null) return ""; |
|
400 |
|
|
276 | 401 |
switch (cell.getCellTypeEnum()) { |
277 |
case FORMULA: |
|
278 |
return "#formulla"; |
|
279 |
case NUMERIC: |
|
280 |
if (DateUtil.isCellDateFormatted(cell)) { |
|
281 |
SimpleDateFormat dateFormat = new SimpleDateFormat("dd/MM/yyyy");
|
|
282 |
return dateFormat.format(cell.getDateCellValue());
|
|
283 |
} else {
|
|
284 |
Double value = cell.getNumericCellValue();
|
|
285 |
return value.toString();
|
|
286 |
}
|
|
287 |
case STRING: |
|
288 |
return cell.getStringCellValue(); |
|
289 |
case BLANK: |
|
290 |
return ""; |
|
291 |
case BOOLEAN: |
|
292 |
return ""+cell.getBooleanCellValue(); |
|
293 |
case ERROR: |
|
294 |
return "#error"; |
|
295 |
default: |
|
296 |
return "#typeerror"; |
|
402 |
case FORMULA:
|
|
403 |
return "#formulla";
|
|
404 |
case NUMERIC:
|
|
405 |
if (DateUtil.isCellDateFormatted(cell)) {
|
|
406 |
SimpleDateFormat dateFormat = new SimpleDateFormat("dd/MM/yyyy");
|
|
407 |
return dateFormat.format(cell.getDateCellValue());
|
|
408 |
} else {
|
|
409 |
Double value = cell.getNumericCellValue();
|
|
410 |
return value.toString();
|
|
411 |
}
|
|
412 |
case STRING:
|
|
413 |
return cell.getStringCellValue();
|
|
414 |
case BLANK:
|
|
415 |
return "";
|
|
416 |
case BOOLEAN:
|
|
417 |
return ""+cell.getBooleanCellValue();
|
|
418 |
case ERROR:
|
|
419 |
return "#error";
|
|
420 |
default:
|
|
421 |
return "#typeerror";
|
|
297 | 422 |
} |
298 | 423 |
} |
299 |
|
|
424 |
|
|
300 | 425 |
public static void main(String[] args) throws Exception { |
301 |
// ArrayList<ArrayList<String>> data = toTable(new File("/home/mdecorde/xml/ruscorpora1m-test/metadata.xlsx"), null); |
|
302 |
// if (data.size() == 0) { |
|
303 |
// System.out.println("no data."); |
|
304 |
// } else { |
|
305 |
// System.out.println(data); |
|
306 |
// } |
|
426 |
// ArrayList<ArrayList<String>> data = toTable(new File("/home/mdecorde/xml/ruscorpora1m-test/metadata.xlsx"), null);
|
|
427 |
// if (data.size() == 0) {
|
|
428 |
// System.out.println("no data.");
|
|
429 |
// } else {
|
|
430 |
// System.out.println(data);
|
|
431 |
// }
|
|
307 | 432 |
|
308 |
File tableFile = new File("/home/mdecorde/TEMP/ANTRACT/AF/mini trs fixed/metadata.xlsx");
|
|
309 |
File table2File = new File("/home/mdecorde/TEMP/ANTRACT/AF/mini trs fixed/metadata2.xlsx");
|
|
433 |
File tableFile = new File("/home/mdecorde/TEMP/ANTRACT/AF/all.xlsx");
|
|
434 |
File table2File = null;
|
|
310 | 435 |
|
311 | 436 |
System.out.println("open..."); |
312 | 437 |
ReadExcel excel = new ReadExcel(tableFile, null); |
313 |
System.out.println("keeping..."); |
|
314 |
excel.keepColumns(Arrays.asList("Date de diffusion", "Durée", "Identifiant de la notice", "Nom fichier segmenté (info)", "Notes du titre", "Titre propre")); |
|
315 |
System.out.println("renaming..."); |
|
438 |
|
|
439 |
HashMap<String, String> lineRules = new HashMap<String, String>(); // line tests to select line to keep |
|
440 |
List<String> columnsSelection; // lsit of columns to keep |
|
441 |
|
|
442 |
|
|
443 |
// //emissions |
|
444 |
// table2File = new File("/home/mdecorde/TEMP/ANTRACT/AF/emissions.xlsx"); |
|
445 |
// columnsSelection = Arrays.asList( |
|
446 |
// "Identifiant de la notice", "Titre propre", "Notes du titre", "Date de diffusion", "Durée", "Nom fichier segmenté (info)", "antract_video", |
|
447 |
// "antract_debut","antract_fin","antract_duree","antract_tc_type","antract_tc_date"); |
|
448 |
// lineRules.put("Type de notice", "Notice sommaire"); |
|
449 |
|
|
450 |
//sujets |
|
451 |
table2File = new File("/home/mdecorde/TEMP/ANTRACT/AF/sujets.xlsx"); |
|
452 |
columnsSelection = Arrays.asList( |
|
453 |
"Identifiant de la notice", "Titre propre", "Notes du titre", "Lien notice principale", |
|
454 |
"Date de diffusion", "Type de date", "Durée", "Genre", "Langue VO / VE", "Nature de production", "Producteurs (Aff.)", "Thématique", |
|
455 |
"Nom fichier segmenté (info)", "antract_video", "antract_debut","antract_fin","antract_duree","antract_tc_type","antract_tc_date", |
|
456 |
"Résumé", "Séquences", "Descripteurs (Aff. Lig.)", "Générique (Aff. Lig.)" |
|
457 |
); |
|
458 |
lineRules.put("Type de notice", "Notice sujet"); |
|
459 |
|
|
460 |
table2File.delete(); |
|
461 |
ReadExcel excel2 = new ReadExcel(table2File, null); |
|
462 |
if (!excel.extractTo(excel2, lineRules, columnsSelection)) { |
|
463 |
System.out.println("FAIL"); |
|
464 |
return; |
|
465 |
} |
|
466 |
|
|
467 |
System.out.println("renaming column..."); |
|
316 | 468 |
HashMap<String, String> rules = new HashMap<String, String>(); |
317 |
rules.put("Date de diffusion", "date_diffusion"); |
|
318 |
rules.put("Durée", "duree"); |
|
319 |
rules.put("Identifiant de la notice", ""); |
|
320 |
rules.put("Nom fichier segmenté (info)", "fichier_segmente"); |
|
321 |
rules.put("Notes du titre", "subtitle"); |
|
322 |
rules.put("Titre propre", "title"); |
|
323 |
excel.renameColumns(rules); |
|
324 |
System.out.println("saving..."); |
|
325 |
excel.saveAs(table2File); |
|
469 |
rules.put("Identifiant de la notice", "id"); |
|
470 |
excel2.renameColumns(rules); |
|
471 |
|
|
472 |
System.out.println("saving&closing..."); |
|
473 |
excel2.save(); |
|
474 |
excel2.close(); |
|
475 |
excel.close(); |
|
326 | 476 |
System.out.println("done."); |
327 | 477 |
} |
328 | 478 |
} |
Formats disponibles : Unified diff