去除PDF文件中的斜体文字水印
网上也有PDF去除水印的文章、方法和代码,GitHub上也有,但是这些都是去除以图片为主的水印。一般情况下PDF水印均是斜体,印于文档的底部,按照GitHub或网上的文章根本无法去除,也不是一个正确的去除办法。这里要说的是一个能够正确去除水印、并已经在实际运行的项目中使用的方法。
斜体水印并不是图片,因此不能通过检测PDF中的图片来删除水印。这种水印其实本身是文字,要用清除文字的方式来清除。主要思路是通过检测PDF中文字的倾斜度来识别水印,然后进行清除。下面给出源代码。
WatermarkScancer.java 水印检测类,用于检测PDF中的水印,并将检测到的文字保存到缓存中。
WatermarkRemover.java 水印清除类,用于清除PDF中的水印。
WatermarkProcessor.java 水印清除器类,用于执行任务。
本文采用并行处理,可处理多页PDF的去水印。
import java.io.OutputStream;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Vector;
import java.util.concurrent.CompletableFuture;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDStream;
public class WatermarkProcessor extends BaseWatermarkRemover implements IWatermarkProcessor {
@Override
public void init(PDDocument document) {
super.init(document);
// 扫描PDF⽂档,检查是否包含⽔印
CompletableFuture<Void> checkerTask = CompletableFuture.runAsync(() -> {
WatermarkChecker checker = new WatermarkChecker(WatermarkProcessor.this);
checker.run();
});
CompletableFuture.allOf(checkerTask).join();
// 扫描PDF⽂档,获取所有⽔印,如果超过3页,则启动多线程并⾏扫描
int threadCount = getThreadCount();
CompletableFuture<?>[] scancerTasks = new CompletableFuture<?>[threadCount];
for (int i = 0; i < threadCount; i++) {
final int pageStart = i * 3;
scancerTasks[i] = CompletableFuture.runAsync(() -> {
WatermarkScancer scancer = new WatermarkScancer(WatermarkProcessor.this, pageStart, 3);
scancer.run();
});
}
CompletableFuture.allOf(scancerTasks).join();
}
/**
* 清除⽔印的实现当超过3页时,本⽅法采⽤多线程执⾏,并⾏清除页⾯⽔印,以提⾼效率。
*/
@Override
public void removeWatermark() throws Exception {
int threadCount = getThreadCount();
CompletableFuture<?>[] removerTasks = new CompletableFuture<?>[threadCount];
final Vector<RemoveResult> removeResults = new Vector<>();
for (int i = 0; i < threadCount; i++) {
final int pageStart = i * 3;
removerTasks[i] = CompletableFuture.runAsync(() -> {
WatermarkRemover remover = new WatermarkRemover(WatermarkProcessor.this, pageStart, 3, null);
removeResults.PageTokens());
});
}
CompletableFuture.allOf(removerTasks).join();
// 对所有结果进⾏排序
Collections.sort(removeResults, new Comparator<RemoveResult>() {
@Override
public int compare(RemoveResult o1, RemoveResult o2) {
PageNo() - o2.getPageNo();
}
});
// 执⾏完毕后统⼀进⾏回写处理
for (RemoveResult result : removeResults) {
PDStream updatedStream = new PDStream(document);
OutputStream out = ateOutputStream(COSName.FLATE_DECODE);
ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
判决和裁定的区别tokenWriter.Tokens());
out.close();
}
}
@Override
public void removeWatermark(List<String> watermarks) throws Exception {
int threadCount = getThreadCount();
CompletableFuture<?>[] removerTasks = new CompletableFuture<?>[threadCount];
final Vector<RemoveResult> removeResults = new Vector<>();
for (int i = 0; i < threadCount; i++) {
final int pageStart = i * 3;
removerTasks[i] = CompletableFuture.runAsync(() -> {
WatermarkRemover remover = new WatermarkRemover(WatermarkProcessor.this, pageStart, 3, watermarks); veWatermark();
removeResults.PageTokens());
});
}
CompletableFuture.allOf(removerTasks).join();
// 对所有结果进⾏排序
Collections.sort(removeResults, new Comparator<RemoveResult>() {
@Override
public int compare(RemoveResult o1, RemoveResult o2) {
PageNo() - o2.getPageNo();
}
});
// 执⾏完毕后统⼀进⾏回写处理
for (RemoveResult result : removeResults) {
PDStream updatedStream = new PDStream(document);
OutputStream out = ateOutputStream(COSName.FLATE_DECODE);
ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
tokenWriter.Tokens());
out.close();
}
}
private int getThreadCount() {
private int getThreadCount() {
return new NumberOfPages() / 3d)).intValue();
}
}
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdmodel.PDPage;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class WatermarkRemover {
台式机能无线上网吗Logger logger = Logger(WatermarkRemover.class);
IWatermarkProcessor remover;
List<RemoveResult> pageTokens = new ArrayList<>();
List<String> watermarks = null;
int pageStartIndex;
int pageLength;
public WatermarkRemover(IWatermarkProcessor remover, int pageStartIndex, int pageLength, List<String> watermarks) { ver = remover;
this.pageStartIndex = pageStartIndex;
this.pageLength = pageLength;
this.watermarks = watermarks;
}
public void removeWatermark() {
for (int i = pageStartIndex; i < pageStartIndex + pageLength; i++) {
if (i >= Document().getNumberOfPages()) {
break;
}
try {
processPage(i, Document().getPage(i));
} catch (Exception e) {
<("【解析PDF页⾯失败】", e);
}
}
}
public void processPage(int index, PDPage page) throws Exception {
Object next;
Operator op;
PDFStreamParser parser = new PDFStreamParser(page);
建设银行网银转账parser.parse();
List<?> tokens = Tokens();
if (Null(tokens)) {
for (int j = 0; j < tokens.size(); j++) {
next = (j);
if (Objects.isNull(next))
continue;
if (next instanceof Operator) {
op = (Operator) next;
if (op.getName().equals("Tj")) {
COSString previous = (COSString) (j - 1);
String string = String();
if (Utils.isISO8859_1Charset(string)) {
string = new Bytes("ISO8859-1"), "GBK");
}
// 判断是否是⽔印
if (null != watermarks && ains(string)) {
previous.setValue("".getBytes("GBK"));
} else if (remover.isWatermarkWord(string)) {
// 判断是否是⽔印
previous.setValue("".getBytes("GBK"));
}
}
}
}
}
RemoveResult pageResult = new RemoveResult(page, index, tokens); pageTokens.add(pageResult);
}
public List<RemoveResult> getPageTokens() {
return pageTokens;
}
static class RemoveResult {
PDPage page;
int pageNo;
List<?> tokens;
public RemoveResult(PDPage page, int pageNo, List<?> tokens) {
this.page = page;
this.pageNo = pageNo;
}
public PDPage getPage() {
return page;
}
污到你那里滴水不止的说说短篇public void setPage(PDPage page) {
this.page = page;
}
public int getPageNo() {
return pageNo;
}
public void setPageNo(int pageNo) {
剑法this.pageNo = pageNo;
}
public List<?> getTokens() {
return tokens;
}
public void setTokens(List<?> tokens) {
}
}
}
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.DrawObject;
import org.apache.pdfbox.contentstream.operator.DrawObject;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
import org.apache.pdfbox.contentstream.operator.state.Restore;
import org.apache.pdfbox.contentstream.operator.state.Save;
import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters;
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.util.Matrix;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class WatermarkScancer extends PDFStreamEngine {
// SLF4J logger for this scanner.
Logger logger = LoggerFactory.getLogger(WatermarkScancer.class);
// Processor handed to the constructor.
IWatermarkProcessor remover;
// Zero-based index of the first page to scan.
int pageStartIndex;
// Maximum number of pages to scan starting at pageStartIndex.
int pageLength;
/**
 * Creates a scanner for a contiguous range of pages and registers the operators it relies on.
 *
 * @param remover        processor this scanner works for
 * @param pageStartIndex zero-based index of the first page to scan
 * @param pageLength     maximum number of pages to scan from {@code pageStartIndex}
 */
public WatermarkScancer(IWatermarkProcessor remover, int pageStartIndex, int pageLength) {
    addOperator(new Concatenate());
    addOperator(new DrawObject());
    addOperator(new SetGraphicsStateParameters());
    addOperator(new Save());
    addOperator(new Restore());
    addOperator(new SetMatrix());
    // Previously the parameter was dropped, leaving the field null.
    this.remover = remover;
    this.pageStartIndex = pageStartIndex;
    this.pageLength = pageLength;
}
/**
* 开始扫描,检查所有⽔印
*/
public void run() {
try {
for (int i = pageStartIndex; i < pageStartIndex + pageLength; i++) {
if (i >= Document().getNumberOfPages()) {
break;
}
Document().getPage(i));
}
} catch (Exception e) {
<("【扫描页⾯⽔印出错】", e);
}
}
/**
* 处理读取的每⼀个点位
*/
@Override
protected void processOperator(Operator operator, List<COSBase> operands) throws IOException { String operation = Name();
if ("Tj".equals(operation)) {
COSString textObj = (COSString) (0);
String string = String();
if (Utils.isISO8859_1Charset(string)) {
string = new Bytes("ISO8859-1"), "GBK");
}
// 检查是否是倾斜的⽔印
Matrix matrix = getTextLineMatrix();
if (matrix != null && ScaleY() != 0 && ScaleY() != 1 && ShearY() != 0) {
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论