|
@@ -4,7 +4,6 @@ import com.alibaba.fastjson.JSONArray;
|
|
|
import com.alibaba.fastjson.JSONObject;
|
|
|
import lombok.AllArgsConstructor;
|
|
|
import lombok.Getter;
|
|
|
-import org.apache.ibatis.jdbc.Null;
|
|
|
import org.apache.logging.log4j.util.Strings;
|
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
import org.apache.pdfbox.pdmodel.PDPage;
|
|
@@ -239,13 +238,13 @@ public class PDFTableReader<T> {
|
|
|
Map<String, Float> points = new HashMap<>();
|
|
|
//获取文档坐标
|
|
|
try {
|
|
|
- PDDocument document = PDDocument.load(file.getInputStream());
|
|
|
+ PDDocument document = PDDocument.load(file.getInputStream());
|
|
|
PDFTextStripper textStripper = new PDFTextStripper() {
|
|
|
@Override
|
|
|
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
|
|
if (text.contains(sourceTex)) {
|
|
|
TextPosition textPositionStart = textPositions.get(0);
|
|
|
- TextPosition textPositionEnd = textPositions.get(textPositions.size()-1);
|
|
|
+ TextPosition textPositionEnd = textPositions.get(textPositions.size() - 1);
|
|
|
points.put("x", textPositionStart.getX());
|
|
|
points.put("y", textPositionStart.getX());
|
|
|
}
|
|
@@ -273,23 +272,23 @@ public class PDFTableReader<T> {
|
|
|
PDDocument document = null;
|
|
|
try {
|
|
|
inputStream = file.getInputStream();
|
|
|
- document = PDDocument.load(inputStream);
|
|
|
+ document = PDDocument.load(inputStream);
|
|
|
PDFTextStripper textStripper = new PDFTextStripper() {
|
|
|
@Override
|
|
|
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
|
|
- if(text.contains(startText)) {
|
|
|
+ if (text.contains(startText)) {
|
|
|
int startIndex = 0;
|
|
|
int endIndex = text.length();
|
|
|
- if(Strings.isNotBlank(startText)) {
|
|
|
+ if (Strings.isNotBlank(startText)) {
|
|
|
startIndex = text.indexOf(startText);
|
|
|
}
|
|
|
- if(Strings.isNotBlank(endText)) {
|
|
|
+ if (Strings.isNotBlank(endText)) {
|
|
|
endIndex = text.indexOf(endText, startIndex + startText.length());
|
|
|
}
|
|
|
- if(startIndex == -1) {
|
|
|
+ if (startIndex == -1) {
|
|
|
startIndex = 0;
|
|
|
}
|
|
|
- if(endIndex == -1) {
|
|
|
+ if (endIndex == -1) {
|
|
|
endIndex = text.length();
|
|
|
}
|
|
|
result.add(text.substring(startIndex + startText.length(), endIndex).trim());
|
|
@@ -308,15 +307,15 @@ public class PDFTableReader<T> {
|
|
|
|
|
|
} catch (IOException e) {
|
|
|
e.printStackTrace();
|
|
|
- }finally {
|
|
|
- if(document != null){
|
|
|
+ } finally {
|
|
|
+ if (document != null) {
|
|
|
try {
|
|
|
document.close();
|
|
|
} catch (IOException e) {
|
|
|
e.printStackTrace();
|
|
|
}
|
|
|
}
|
|
|
- if(inputStream != null){
|
|
|
+ if (inputStream != null) {
|
|
|
try {
|
|
|
inputStream.close();
|
|
|
} catch (IOException e) {
|
|
@@ -325,7 +324,7 @@ public class PDFTableReader<T> {
|
|
|
}
|
|
|
|
|
|
}
|
|
|
- if(result == null || result.size() == 0) {
|
|
|
+ if (result == null || result.size() == 0) {
|
|
|
result.add(" ");
|
|
|
}
|
|
|
return result;
|
|
@@ -340,20 +339,20 @@ public class PDFTableReader<T> {
|
|
|
PDDocument document = null;
|
|
|
try {
|
|
|
inputStream = file.getInputStream();
|
|
|
- document = PDDocument.load(inputStream);
|
|
|
+ document = PDDocument.load(inputStream);
|
|
|
PDFTextStripper textStripper = new PDFTextStripper() {
|
|
|
@Override
|
|
|
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
|
|
- if(text.contains(lineStartText)) {
|
|
|
+ if (text.contains(lineStartText)) {
|
|
|
startRecord[0] = true;
|
|
|
}
|
|
|
- if(Strings.isNotEmpty(lineEndText)) {
|
|
|
- if(text.contains(lineEndText)) {
|
|
|
+ if (Strings.isNotEmpty(lineEndText)) {
|
|
|
+ if (text.contains(lineEndText)) {
|
|
|
startRecord[0] = false;
|
|
|
return;
|
|
|
}
|
|
|
}
|
|
|
- if(startRecord[0]) {
|
|
|
+ if (startRecord[0]) {
|
|
|
result.add(text.trim());
|
|
|
}
|
|
|
}
|
|
@@ -369,15 +368,15 @@ public class PDFTableReader<T> {
|
|
|
|
|
|
} catch (IOException e) {
|
|
|
e.printStackTrace();
|
|
|
- }finally {
|
|
|
- if(document != null){
|
|
|
+ } finally {
|
|
|
+ if (document != null) {
|
|
|
try {
|
|
|
document.close();
|
|
|
} catch (IOException e) {
|
|
|
e.printStackTrace();
|
|
|
}
|
|
|
}
|
|
|
- if(inputStream != null){
|
|
|
+ if (inputStream != null) {
|
|
|
try {
|
|
|
inputStream.close();
|
|
|
} catch (IOException e) {
|
|
@@ -386,7 +385,7 @@ public class PDFTableReader<T> {
|
|
|
}
|
|
|
|
|
|
}
|
|
|
- if(result == null || result.size() == 0) {
|
|
|
+ if (result == null || result.size() == 0) {
|
|
|
result.add(" ");
|
|
|
}
|
|
|
return result;
|
|
@@ -400,15 +399,15 @@ public class PDFTableReader<T> {
|
|
|
PDDocument document = null;
|
|
|
try {
|
|
|
inputStream = file.getInputStream();
|
|
|
- document = PDDocument.load(inputStream);
|
|
|
+ document = PDDocument.load(inputStream);
|
|
|
PDFTextStripper textStripper = new PDFTextStripper() {
|
|
|
@Override
|
|
|
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
|
|
- if(text.contains(lineStartText)) {
|
|
|
+ if (text.contains(lineStartText)) {
|
|
|
startRecord[0] = true;
|
|
|
return;
|
|
|
}
|
|
|
- if(startRecord[0]) {
|
|
|
+ if (startRecord[0]) {
|
|
|
result.add(text.trim());
|
|
|
startRecord[0] = false;
|
|
|
}
|
|
@@ -426,15 +425,15 @@ public class PDFTableReader<T> {
|
|
|
|
|
|
} catch (IOException e) {
|
|
|
e.printStackTrace();
|
|
|
- }finally {
|
|
|
- if(document != null){
|
|
|
+ } finally {
|
|
|
+ if (document != null) {
|
|
|
try {
|
|
|
document.close();
|
|
|
} catch (IOException e) {
|
|
|
e.printStackTrace();
|
|
|
}
|
|
|
}
|
|
|
- if(inputStream != null){
|
|
|
+ if (inputStream != null) {
|
|
|
try {
|
|
|
inputStream.close();
|
|
|
} catch (IOException e) {
|
|
@@ -443,35 +442,36 @@ public class PDFTableReader<T> {
|
|
|
}
|
|
|
|
|
|
}
|
|
|
- if(result == null || result.size() == 0) {
|
|
|
+ if (result == null || result.size() == 0) {
|
|
|
result.add(" ");
|
|
|
}
|
|
|
return result;
|
|
|
}
|
|
|
|
|
|
- public static JSONArray getTableByPosition(MultipartFile file, List<String> fields, List<String> ignoreFields, String endKey, int tableEndIndex, String headerAlignment, String standardX, String headerYText) {
|
|
|
- //获取文档坐标
|
|
|
+ public static JSONArray getTableByPosition(MultipartFile file, List<String> fields, List<String> ignoreFields, String endKey, int tableEndIndex, String headerAlignment, String standardX, String headerYText, String[] extraTableLine, int dataColumn, int headerLimit) {
|
|
|
JSONArray result = new JSONArray();
|
|
|
Map<String, List<PdfTextPosition>> documentPositions = new HashMap<>();
|
|
|
Map<Float, JSONObject> limitObject = new HashMap<>();
|
|
|
- PdfTable pdfTable = getHeaderAndIgnoreLine(file, fields, ignoreFields, endKey, tableEndIndex, documentPositions, headerAlignment, standardX, headerYText);
|
|
|
-
|
|
|
- List<PdfCell> cells = pdfTable.getHeaderRow().getCell().stream().sorted(Comparator.comparing(PdfCell::getCellStartX)).collect(Collectors.toList());
|
|
|
+ Map<Float, JSONObject> resultObject = new HashMap<>();
|
|
|
+ PdfTable pdfTable = getHeaderAndIgnoreLine(file, fields, ignoreFields, endKey, tableEndIndex, documentPositions, headerAlignment, standardX, headerYText, extraTableLine, headerLimit);
|
|
|
+ Map<Float, PdfRow> rowMap = pdfTable.getPdfRows() == null ? new HashMap<>() : pdfTable.getPdfRows();
|
|
|
+ List<PdfRow> headerRows = pdfTable.getHeaderRows();
|
|
|
+ List<PdfCell> cells = headerRows.get(0).getCell().stream().sorted(Comparator.comparing(PdfCell::getCellStartX)).collect(Collectors.toList());
|
|
|
int index = 0;
|
|
|
- cells = cells.stream().filter(item -> !(item.getCellY() < pdfTable.getHeaderRow().getRowY())).collect(Collectors.toList());
|
|
|
- for(PdfCell cell : cells) {
|
|
|
+ cells = cells.stream().filter(item -> !(item.getCellY() < headerRows.get(0).getRowY())).collect(Collectors.toList());
|
|
|
+ for (PdfCell cell : cells) {
|
|
|
//设置第一个表头宽度
|
|
|
- if(index == 0) {
|
|
|
+ if (index == 0) {
|
|
|
if (headerAlignment.equals("mediate")) {
|
|
|
- float space = cell.getCellStartX() - pdfTable.getStandardPosition();
|
|
|
+ float space = cell.getCellStartX() - pdfTable.getStandardPosition() + 1;
|
|
|
cell.setCellStartX(pdfTable.getStandardPosition());
|
|
|
//单元格结束x值为起始x值 - 偏移量 + 结束x值
|
|
|
cell.setCellEndX(cell.getCellEndX() + space);
|
|
|
} else if (headerAlignment.equals("left")) {
|
|
|
cell.setCellStartX(0);
|
|
|
}
|
|
|
- index ++;
|
|
|
- }else {
|
|
|
+ index++;
|
|
|
+ } else {
|
|
|
if (headerAlignment.equals("mediate")) {
|
|
|
//获取前一个单元格
|
|
|
PdfCell prePdfCell = cells.get(index - 1);
|
|
@@ -482,31 +482,123 @@ public class PDFTableReader<T> {
|
|
|
//获取前一个单元格
|
|
|
PdfCell prePdfCell = cells.get(index - 1);
|
|
|
//设置前一个单元格的endx值为当前单元格的startx
|
|
|
- prePdfCell.setCellEndX(cell.getCellStartX());
|
|
|
- if (cell.getHeaderText().equals(fields.get(fields.size() - 1))) {
|
|
|
+ prePdfCell.setCellEndX(cell.getCellStartX() - headerLimit);
|
|
|
+ if (cell.getText().equals(fields.get(fields.size() - 1))) {
|
|
|
cell.setCellEndX(pdfTable.getWidth());
|
|
|
}
|
|
|
}
|
|
|
- index ++;
|
|
|
+ index++;
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ //表格内容处理,有时会将一行数据全部识别到一个单元格里,这边做数据拆分
|
|
|
+ List<String> removeKey = new ArrayList<>();
|
|
|
+ Map<String, List<PdfTextPosition>> addPosition = new HashMap<>();
|
|
|
+ for (String key : documentPositions.keySet()) {
|
|
|
+ List<PdfTextPosition> textPositions = documentPositions.get(key);
|
|
|
+ try {
|
|
|
+ PdfRow cellRow = rowMap.get(textPositions.get(0).getY()) == null ? new PdfRow() : rowMap.get(textPositions.get(0).getY());
|
|
|
+ List<PdfCell> rowCells = cellRow.getCell() == null ? new ArrayList<>() : cellRow.getCell();
|
|
|
+ PdfCell cell = new PdfCell();
|
|
|
+
|
|
|
+
|
|
|
+ boolean isRemove = false;
|
|
|
+ if ((textPositions.get(0).getY() > pdfTable.getTableStartY() &&
|
|
|
+ textPositions.get(0).getY() < pdfTable.getTableEndY())
|
|
|
+ && pdfTable.getIgnoreRows().get(textPositions.get(0).getY()) == null) {
|
|
|
+ List<PdfTextPosition> limitList = new ArrayList<>();
|
|
|
+ StringBuffer limitKey = new StringBuffer();
|
|
|
+ for (PdfCell pdfCell : cells) {
|
|
|
+ limitList = new ArrayList<>();
|
|
|
+ limitKey = new StringBuffer();
|
|
|
+ for (PdfTextPosition position : documentPositions.get(key)) {
|
|
|
+ if (position.getX() >= pdfCell.getCellStartX() && position.getX() < pdfCell.getCellEndX()) {
|
|
|
+ limitList.add(position);
|
|
|
+ limitKey.append(position.getUnicode());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (limitList != null && limitList.size() > 0) {
|
|
|
+ String newKey = limitKey + "-@@@-" + (limitList.get(0).getX() + limitList.get(0).getY());
|
|
|
+ if (documentPositions.get(newKey) == null) {
|
|
|
+ isRemove = true;
|
|
|
+ } else {
|
|
|
+ cell = new PdfCell();
|
|
|
+ cell.setPositions(limitList);
|
|
|
+ cell.setText(String.valueOf(limitKey));
|
|
|
+ cell.setCellY(textPositions.get(0).getY());
|
|
|
+ cell.setCellStartX(textPositions.get(0).getX());
|
|
|
+ cell.setCellEndX(textPositions.get(textPositions.size() - 1).getX());
|
|
|
+ cell.setPdFont(textPositions.get(0).getFont());
|
|
|
+ rowCells.add(cell);
|
|
|
+ cellRow.setCell(rowCells);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ addPosition.put(newKey, limitList);
|
|
|
+ cell = new PdfCell();
|
|
|
+ cell.setPositions(limitList);
|
|
|
+ cell.setText(String.valueOf(limitKey));
|
|
|
+ cell.setCellY(textPositions.get(0).getY());
|
|
|
+ cell.setCellStartX(textPositions.get(0).getX());
|
|
|
+ cell.setCellEndX(textPositions.get(textPositions.size() - 1).getX());
|
|
|
+ cell.setPdFont(textPositions.get(0).getFont());
|
|
|
+ rowCells.add(cell);
|
|
|
+ cellRow.setCell(rowCells);
|
|
|
+
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ } else {
|
|
|
+ cell.setPositions(textPositions);
|
|
|
+ cell.setText(String.valueOf(key).split("-@@@-")[0]);
|
|
|
+ cell.setCellY(textPositions.get(0).getY());
|
|
|
+ cell.setCellStartX(textPositions.get(0).getX());
|
|
|
+ cell.setCellEndX(textPositions.get(textPositions.size() - 1).getX());
|
|
|
+ cell.setPdFont(textPositions.get(0).getFont());
|
|
|
+ rowCells.add(cell);
|
|
|
+ cellRow.setCell(rowCells);
|
|
|
+ }
|
|
|
+
|
|
|
+ cellRow.setRowY(textPositions.get(0).getY());
|
|
|
+ rowMap.put(cell.getCellY(), cellRow);
|
|
|
+
|
|
|
+ if (isRemove) {
|
|
|
+ removeKey.add(key);
|
|
|
+ }
|
|
|
+ } catch (Exception e) {
|
|
|
+ e.printStackTrace();
|
|
|
}
|
|
|
|
|
|
}
|
|
|
- pdfTable.getHeaderRow().setCell(cells);
|
|
|
+ pdfTable.setPdfRows(rowMap);
|
|
|
+ if (removeKey != null && removeKey.size() > 0) {
|
|
|
+ for (String key : removeKey) {
|
|
|
+ documentPositions.remove(key);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ documentPositions.putAll(addPosition);
|
|
|
+
|
|
|
+ List<Float> rows = new ArrayList<>(rowMap.keySet());
|
|
|
+ Collections.sort(rows);
|
|
|
+ headerRows.get(0).setCell(cells);
|
|
|
for (String key : documentPositions.keySet()) {
|
|
|
List<PdfTextPosition> item = documentPositions.get(key);
|
|
|
//处理在表格起始行下和结束行上并且不在忽略行中的数据
|
|
|
if ((item.get(0).getY() > pdfTable.getTableStartY() &&
|
|
|
item.get(0).getY() < pdfTable.getTableEndY())
|
|
|
&& pdfTable.getIgnoreRows().get(item.get(0).getY()) == null) {
|
|
|
- for (PdfCell pdfCell : pdfTable.getHeaderRow().getCell()) {
|
|
|
+ for (PdfCell pdfCell : headerRows.get(0).getCell()) {
|
|
|
//判断此文本的起始x值或结束x值是否存在于表格头对应的单元格的x值区间内,是就添加为此表头对应的数据
|
|
|
float cellStartX = item.get(0).getX();
|
|
|
float cellY = item.get(0).getY();
|
|
|
float cellEndX = item.get(item.size() - 1).getX();
|
|
|
if (cellEndX < pdfCell.getCellEndX() && cellEndX > pdfCell.getCellStartX()) {
|
|
|
JSONObject jsonObject = limitObject.get(cellY) == null ? new JSONObject() : limitObject.get(cellY);
|
|
|
- if(jsonObject.get(pdfCell.getHeaderText()) == null) {
|
|
|
- jsonObject.put(pdfCell.getHeaderText(), key.split("-@@@-")[0]);
|
|
|
+ if (jsonObject.get(pdfCell.getText()) == null) {
|
|
|
+ jsonObject.put(pdfCell.getText(), key.split("-@@@-")[0]);
|
|
|
}
|
|
|
limitObject.put(cellY, jsonObject);
|
|
|
}
|
|
@@ -514,27 +606,116 @@ public class PDFTableReader<T> {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ List<Float> dataLines = new ArrayList<>();
|
|
|
+
|
|
|
+ dataColumn = 0;
|
|
|
for(Float key : limitObject.keySet()) {
|
|
|
- result.add(limitObject.get(key));
|
|
|
+ if(limitObject.get(key).size() > dataColumn) {
|
|
|
+ dataColumn = limitObject.get(key).size();
|
|
|
+ }
|
|
|
}
|
|
|
- if (result == null || result.size() == 0) {
|
|
|
- result.add(" ");
|
|
|
+ for (Float key : limitObject.keySet()) {
|
|
|
+ boolean isHeader = false;
|
|
|
+ //去除标题行
|
|
|
+
|
|
|
+ if (limitObject.get(key).size() >= dataColumn) {
|
|
|
+ for (PdfRow headerRow : headerRows) {
|
|
|
+ if (headerRow.getRowY() == key) {
|
|
|
+ isHeader = true;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (isHeader) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ resultObject.put(key, limitObject.get(key));
|
|
|
+ //result.add(limitObject.get(key));
|
|
|
+ dataLines.add(key);
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ dataLines.sort(new Comparator<Float>() {
|
|
|
+ @Override
|
|
|
+ public int compare(Float o1, Float o2) {
|
|
|
+ // Order row Y coordinates ascending. Float.compare is used instead of (int) (o1 - o2) because the cast truncated any gap smaller than 1 to 0, making close rows compare as equal and leaving their order undefined.
|
|
|
+ return Float.compare(o1, o2);
|
|
|
+ }
|
|
|
+ });
|
|
|
+ List<Float> rowKeys = new ArrayList<>(limitObject.keySet());
|
|
|
+ Collections.sort(rowKeys);
|
|
|
+ //获取行与行之间的数据以及补充修复单元格多行数据的情况
|
|
|
+ for (int i = 0; i < dataLines.size() ; i++) {
|
|
|
+ //获取在每行数据之间,并在表中的额外行之上的数据,补充到对应的单元格内
|
|
|
+ boolean isBreakTime = false;
|
|
|
+ for (Float rowKey : rowKeys) {
|
|
|
+ float nextLine = i + 1 == dataLines.size() ? pdfTable.getTableEndY() : dataLines.get(i + 1);
|
|
|
+ if (rowKey > dataLines.get(i) && rowKey < nextLine) {
|
|
|
+
|
|
|
+ PdfRow pdfRow = rowMap.get(rowKey);
|
|
|
+
|
|
|
+ if (pdfRow != null) {
|
|
|
+ List<PdfCell> rowCells = pdfRow.getCell();
|
|
|
+ if (rowCells != null && rowCells.size() > 0) {
|
|
|
+ List<Map<String, String>> addLines = new ArrayList<>();
|
|
|
+ for (PdfCell pdfCell : rowCells) {
|
|
|
+ //获取对应单元格所在的title
|
|
|
+ for (PdfCell field : headerRows.get(0).getCell()) {
|
|
|
+ if (pdfCell.getCellEndX() < field.getCellEndX() && pdfCell.getCellEndX() > field.getCellStartX()) {
|
|
|
+ Map<String, String> addMap = new HashMap<>();
|
|
|
+ addMap.put(field.getText(), pdfCell.getText());
|
|
|
+ addLines.add(addMap);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (extraTableLine != null) {
|
|
|
+ if (pdfTable.getExtraLinesBorder().contains(rowKey)) {
|
|
|
+ isBreakTime = true;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ }
|
|
|
+ if (isBreakTime) {
|
|
|
+ break;
|
|
|
+ } else if (addLines != null && addLines.size() > 0) {
|
|
|
+ JSONObject jsonObject = resultObject.get(dataLines.get(i));
|
|
|
+ for (Map<String, String> item : addLines) {
|
|
|
+ for (String key : item.keySet()) {
|
|
|
+ jsonObject.put(key, jsonObject.get(key) + " " + item.get(key));
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ for (Float key : resultObject.keySet()) {
|
|
|
+ if(!pdfTable.getExtraLinesBorder().contains(key)) {
|
|
|
+ result.add(resultObject.get(key));
|
|
|
+ }
|
|
|
}
|
|
|
return result;
|
|
|
}
|
|
|
|
|
|
- public static PdfTable getHeaderAndIgnoreLine(MultipartFile file, List<String> fields, List<String> ignoreFields, String endKey, int tableEndIndex, Map<String, List<PdfTextPosition>> documentPositions, String headerAlignment, String standardX, String headerYText) {
|
|
|
+ public static PdfTable getHeaderAndIgnoreLine(MultipartFile file, List<String> fields, List<String> ignoreFields, String endKey, int tableEndIndex, Map<String, List<PdfTextPosition>> documentPositions, String headerAlignment, String standardX, String headerYText, String[] extraLines, int headerLimit) {
|
|
|
PdfTable pdfTable = new PdfTable();
|
|
|
|
|
|
try {
|
|
|
InputStream inputStream = file.getInputStream();
|
|
|
PDDocument document = PDDocument.load(inputStream);
|
|
|
- PdfRow headerRow = new PdfRow();
|
|
|
+ Map<Float, PdfRow> headerRows = new HashMap<>();
|
|
|
PDFTextStripper textStripper = new PDFTextStripper() {
|
|
|
private int pageNumber = 0;
|
|
|
float width = 0;
|
|
|
float height = 0;
|
|
|
float standardXPosition = 0;
|
|
|
+ float rowY = 0;
|
|
|
+
|
|
|
@Override
|
|
|
protected void writePage() throws IOException {
|
|
|
pageNumber++;
|
|
@@ -549,69 +730,65 @@ public class PDFTableReader<T> {
|
|
|
@Override
|
|
|
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
|
|
//设置左侧空白偏移量
|
|
|
- if(text.contains(standardX)) {
|
|
|
+ if (text.contains(standardX)) {
|
|
|
pdfTable.setStandardPosition(textPositions.get(0).getX());
|
|
|
}
|
|
|
|
|
|
+
|
|
|
//重置textPositions各字段高度,加上每页页码*高度
|
|
|
List<PdfTextPosition> positions = new ArrayList<>();
|
|
|
- for(TextPosition textPosition : textPositions) {
|
|
|
+ for (TextPosition textPosition : textPositions) {
|
|
|
PdfTextPosition position = new PdfTextPosition();
|
|
|
position.setX(textPosition.getX());
|
|
|
position.setY(textPosition.getY() + (pageNumber - 1) * height);
|
|
|
position.setPageNum(pageNumber);
|
|
|
+ position.setFont(textPosition.getFont());
|
|
|
+ position.setFontSize(textPosition.getFontSize());
|
|
|
+ position.setUnicode(textPosition.getUnicode());
|
|
|
positions.add(position);
|
|
|
}
|
|
|
+ //寻找表格中各字段隔绝行
|
|
|
+ if (extraLines != null) {
|
|
|
+ List<Float> extraLinesBorder = pdfTable.getExtraLinesBorder() == null ? new ArrayList<>() : pdfTable.getExtraLinesBorder();
|
|
|
+ for (String extra : extraLines) {
|
|
|
+ if (text.contains(extra)) {
|
|
|
+ extraLinesBorder.add(positions.get(0).getY());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ pdfTable.setExtraLinesBorder(extraLinesBorder);
|
|
|
+ }
|
|
|
+
|
|
|
documentPositions.put(text + "-@@@-" + (positions.get(0).getX() + positions.get(0).getY()), positions);
|
|
|
//设置行高
|
|
|
- if(text.contains(headerYText)) {
|
|
|
- headerRow.setRowY(positions.get(0).getY());
|
|
|
+ if (text.contains(headerYText)) {
|
|
|
+ rowY = positions.get(0).getY();
|
|
|
+ PdfRow pdfRow = headerRows.get(rowY) == null ? new PdfRow() : headerRows.get(rowY);
|
|
|
+ pdfRow.setRowY(rowY);
|
|
|
+ headerRows.put(rowY, pdfRow);
|
|
|
}
|
|
|
//获取header所在行
|
|
|
- List<PdfCell> pdfCells = headerRow.getCell() == null ? new ArrayList<>() : headerRow.getCell();
|
|
|
|
|
|
+ PdfRow headerRow = null;
|
|
|
+ List<PdfCell> pdfCells = null;
|
|
|
for (String filed : fields) {
|
|
|
if (text.contains(filed)) {
|
|
|
+ headerRow = headerRows.get(positions.get(0).getY()) == null ? new PdfRow() : headerRows.get(positions.get(0).getY());
|
|
|
+ pdfCells = headerRow.getCell() == null ? new ArrayList<>() : headerRow.getCell();
|
|
|
PdfCell pdfCell = new PdfCell();
|
|
|
-// //处理表格头的单元格边界,根据表头的排列方式来决定单元格宽度
|
|
|
-// if (pdfCells != null && pdfCells.size() > 0) {
|
|
|
-// //居中对齐,字符串左右宽度一致,从第一个开始往后顺
|
|
|
-// if(headerAlignment.equals("mediate")) {
|
|
|
-// //获取前一个单元格
|
|
|
-// PdfCell prePdfCell = pdfCells.get(pdfCells.size() - 1);
|
|
|
-// float space = textPositions.get(0).getX() - prePdfCell.getCellEndX();
|
|
|
-// pdfCell.setCellStartX(prePdfCell.getCellEndX());
|
|
|
-// pdfCell.setCellEndX(textPositions.get(textPositions.size() - 1).getX() + space);
|
|
|
-// }else if(headerAlignment.equals("left")) {
|
|
|
-// //获取前一个单元格
|
|
|
-// PdfCell prePdfCell = pdfCells.get(pdfCells.size() - 1);
|
|
|
-// //设置前一个单元格的endx值为当前单元格的startx
|
|
|
-// prePdfCell.setCellEndX(textPositions.get(0).getX());
|
|
|
-// pdfCell.setCellStartX(textPositions.get(0).getX());
|
|
|
-// if(filed.equals(fields.get(fields.size() - 1))) {
|
|
|
-// pdfCell.setCellEndX(width);
|
|
|
-// }
|
|
|
-// }
|
|
|
-//
|
|
|
-// } else {
|
|
|
-// //设置第一个表头宽度
|
|
|
-// if(headerAlignment.equals("mediate")) {
|
|
|
-// pdfCell.setCellStartX(standardXPosition);
|
|
|
-// //单元格结束x值为起始x值 - 偏移量 + 结束x值
|
|
|
-// pdfCell.setCellEndX(textPositions.get(textPositions.size() - 1).getX() + textPositions.get(0).getX() - standardXPosition);
|
|
|
-// }else if(headerAlignment.equals("left")) {
|
|
|
-// pdfCell.setCellStartX(0);
|
|
|
-// }
|
|
|
-// }
|
|
|
//文本识别可能出错,会带多个字符,需识别标题在此文本中的位置并重新赋起始结束x值
|
|
|
int firstPosition = text.indexOf(filed);
|
|
|
int lastPosition = firstPosition + filed.length() - 1;
|
|
|
+ pdfCell.setCellStartX(positions.get(0).getX());
|
|
|
+ pdfCell.setCellEndX(positions.get(positions.size() - 1).getX());
|
|
|
pdfCell.setCellY(positions.get(0).getY());
|
|
|
- pdfCell.setHeaderText(filed);
|
|
|
+ pdfCell.setPdFont(textPositions.get(0).getFont());
|
|
|
+ pdfCell.setText(filed);
|
|
|
+ pdfCell.setFontSize(textPositions.get(0).getFontSize());
|
|
|
pdfCell.setCellStartX(textPositions.get(firstPosition).getX());
|
|
|
pdfCell.setCellEndX(textPositions.get(lastPosition).getX());
|
|
|
pdfCells.add(pdfCell);
|
|
|
-
|
|
|
+ headerRow.setCell(pdfCells);
|
|
|
+ headerRows.put(positions.get(0).getY(), headerRow);
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -626,24 +803,28 @@ public class PDFTableReader<T> {
|
|
|
pdfTable.setIgnoreRows(ignoreRows);
|
|
|
}
|
|
|
}
|
|
|
- headerRow.setCell(pdfCells);
|
|
|
-
|
|
|
- if(headerRow.getCell() != null && headerRow.getCell().size() > 0) {
|
|
|
- pdfTable.setTableStartY(headerRow.getCell().get(0).getCellY());
|
|
|
- //提供了表格结束行数据就用,不用就按给的字段模糊匹
|
|
|
- if (tableEndIndex == -1) {
|
|
|
- if(Strings.isBlank(endKey)) {
|
|
|
- pdfTable.setTableEndY(document.getNumberOfPages() * height);
|
|
|
- }else if (text.contains(endKey)) {
|
|
|
- pdfTable.setTableEndY(positions.get(0).getY());
|
|
|
- }
|
|
|
- } else {
|
|
|
- pdfTable.setTableEndY(tableEndIndex);
|
|
|
+
|
|
|
+
|
|
|
+ //提供了表格结束行数据就用,不用就按给的字段模糊匹
|
|
|
+ if (tableEndIndex == -1) {
|
|
|
+ if (Strings.isBlank(endKey)) {
|
|
|
+ pdfTable.setTableEndY(document.getNumberOfPages() * height);
|
|
|
+ } else if (text.contains(endKey)) {
|
|
|
+ pdfTable.setTableEndY(positions.get(0).getY());
|
|
|
}
|
|
|
+ } else {
|
|
|
+ pdfTable.setTableEndY(tableEndIndex);
|
|
|
}
|
|
|
+// if (headerRow != null && headerRow.getRowY() > 0) {
|
|
|
+// if (headerRow.getCell() != null && headerRow.getCell().size() > 0) {
|
|
|
+// pdfTable.setTableStartY(headerRow.getCell().get(0).getCellY());
|
|
|
+// }
|
|
|
+// // pdfTable.setHeaderRows(rows);
|
|
|
+// }
|
|
|
|
|
|
- pdfTable.setHeaderRow(headerRow);
|
|
|
}
|
|
|
+
|
|
|
+
|
|
|
};
|
|
|
|
|
|
textStripper.setSortByPosition(true);
|
|
@@ -652,13 +833,20 @@ public class PDFTableReader<T> {
|
|
|
textStripper.getText(document);
|
|
|
document.close();
|
|
|
inputStream.close();
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
+ List<PdfRow> rows = headerRows.values().stream().filter(item -> item.getRowY() > 0).collect(Collectors.toList());
|
|
|
+ rows.sort(new Comparator<PdfRow>() {
|
|
|
+ @Override
|
|
|
+ public int compare(PdfRow o1, PdfRow o2) {
|
|
|
+ // Order header rows top-to-bottom by their Y coordinate. Float.compare replaces (int) (a - b), whose cast collapsed differences below 1 to 0 so nearby rows were treated as equal.
|
|
|
+ return Float.compare(o1.getRowY(), o2.getRowY());
|
|
|
+ }
|
|
|
+ });
|
|
|
+ pdfTable.setHeaderRows(rows);
|
|
|
+ pdfTable.setTableStartY(rows.get(0).getRowY());
|
|
|
} catch (IOException e) {
|
|
|
- System.out.println("测试出问题了");
|
|
|
e.printStackTrace();
|
|
|
}
|
|
|
+
|
|
|
+
|
|
|
return pdfTable;
|
|
|
|
|
|
}
|
|
@@ -697,15 +885,15 @@ public class PDFTableReader<T> {
|
|
|
}
|
|
|
} catch (Exception e) {
|
|
|
e.printStackTrace();
|
|
|
- }finally {
|
|
|
- if(document != null){
|
|
|
+ } finally {
|
|
|
+ if (document != null) {
|
|
|
try {
|
|
|
document.close();
|
|
|
} catch (IOException e) {
|
|
|
e.printStackTrace();
|
|
|
}
|
|
|
}
|
|
|
- if(inputStream != null){
|
|
|
+ if (inputStream != null) {
|
|
|
try {
|
|
|
inputStream.close();
|
|
|
} catch (IOException e) {
|