Spire.pdf-使⽤学习记录
spire.pdf 使⽤学习记录
背景
通过打印机将一本纸质书转为 PDF 格式,以下所有操作都是在这个基础上进行的。最终目的是生成 n 个 txt 文件:每个 txt 文件的命名规则是"起始页_篇&章&节",文件内容是对应切割出来的文本。
简介
这是一款基于 OCR 框架的解析工具,拥有比较完整的 Java 类库及完善的 API 文档,不但具备 PDF 的读写能力,还支持文本和图片的提取、水印的添加、书签的增删改、表格的操作,同时还支持将 PDF 转换成 Word、HTML、XPS、SVG 等多种格式。当然,目前这类工具市面上有很多,不过经过比较,我最后选择了 spire.pdf。目前这款产品共有两个版本,一个是免费版,一个是付费版。免费版如果只是处理简单的 PDF 是没问题的,但是如果涉及到输出为 PDF,则只会显示前 10 页,第 11 页则是预设的购买介绍页。不过鉴于 spire.pdf 的完善性,我最后还是选择了它;至于 10 页的限制,后面会给出我的处理办法。如果你使用的是其他语言或平台,如 .NET、Android,也可以使用这款产品。
相似产品及特点
PDFBox
itext
百度
Tesseract 只能用于识别图片;如果要处理 PDF,需要先将 PDF 转为图片
官⽹地址
功能实现
可以前往官网下载 jar 包,也可以直接在 Maven 中通过坐标引入;不过 Maven 仓库中的版本不如官网下载的新。
<!-- 事先配置maven仓库路径-->
<repositories>
<repository>
<id>com.e-iceblue</id>
<url>https://repo.e-iceblue.cn/repository/maven-public/</url>
</repository>
</repositories>
<!--maven中 spire.pdf镜像依赖-->
<dependencies>
<dependency>
<groupId>e-iceblue</groupId>
<artifactId>spire.pdf</artifactId>
<version>3.11.6</version>
</dependency>
</dependencies>
切割pdf
按每页切割
/**
 * Splits the source PDF into separate PDF documents, one per page.
 *
 * <p>The input file and the output file-name pattern are hard-coded. The page count of the
 * loaded document is printed to standard output before splitting.
 */
public static void splitPdfOneByOne() {
    PdfDocument pdf = new PdfDocument();
    try {
        pdf.loadFromFile("C:\\Users\\wangchenchen\\Desktop\\boot-structure\\book.pdf");
        // Read the page count only after loading; before loading the document has no pages.
        int count = pdf.getPages().getCount();
        System.out.println(count);
        // "{0}" is the placeholder in the destination pattern; the second argument is the
        // start number passed to Spire's split(...).
        pdf.split("C:\\Users\\wangchenchen\\Desktop\\boot-structure\\output\\surgery_{0}.pdf", 0);
    } finally {
        // Close the document even when loading or splitting fails.
        pdf.close();
    }
}
按指定页切割
package;
import com.spire.pdf.PdfDocument;
import com.spire.pdf.PdfPageBase;
import com.spire.pdf.graphics.PdfMargins;
import FileUtil;
import java.awt.geom.Point2D;
import java.io.File;
/**
* 拆分⽂档
* 拆成每页⼀个
* 拆成每九页⼀个(免费版最多只能处理九页)
* @author wangcc
* @createTime 2021年08⽉31⽇ 23:25:00
*/
public class SubPDF {
/**
* 每九页⽣成⼀个pdf
**/
public static void splitPdfMoreByOne(){
String fileName ="C:\\Users\\wangchenchen\\Desktop\\boot-structure\\book.pdf";
String outPath ="C:\\Users\\wangchenchen\\Desktop\\boot-structure\\outFile\\outPDFByMore";        PdfDocument pdf =new PdfDocument();
pdf.loadFromFile(fileName);
int totalCount = Pages().getCount();
PdfPageBase pageBase;
PdfDocument document =new PdfDocument();
int count =1;
for(int i =41; i<822;i++){
System.out.println(i+"/"+822);
pageBase = Pages().Pages().get(i).getSize(),new PdfMargins(0));            Pages().get(i).createTemplate().draw(pageBase,new Point2D.Float(0,0));
if(count %9==0){
String path ="\\splitPdf-"+i+".pdf";
document.saveToFile(outPath+path);
document =new PdfDocument();
}
count++;
}
Pages().getCount()>0){
String path ="\\splitPdf-999999.pdf";
document.saveToFile(outPath+path);
}
}
/**
* @Description //
* @return void
**/
public static void splitPdfByNumber(Integer begin, Integer end,String filePath,String pdfOutPath){
public static void splitPdfByNumber(Integer begin, Integer end,String filePath,String pdfOutPath){
if(begin.equals("")|| end.equals("")|| filePath.equals("")|| pdfOutPath.equals("")){
System.out.println("传⼊参数有空.......");
return;
}
if(begin >= end){
System.out.println("截⽌页数不能⼩于或等于开始页数.......");
return;
}
if(end-begin >9){
System.out.println("操作页数最多为9页");
return;
}
File file =new File(pdfOutPath);
ists()){
FileUtil fileUtil =new FileUtil();
DeleteFolder(pdfOutPath);
}
PdfDocument pdf =new PdfDocument();
pdf.loadFromFile(filePath);
int totalCount = Pages().getCount();
PdfPageBase pageBase;
PdfDocument document =new PdfDocument();
for(int i = begin; i< end;i++){
System.out.println(i+"/"+end);
pageBase = Pages().Pages().get(i).getSize(),new PdfMargins(0));
String path ="\\surgery_"+i+".pdf";
document.saveToFile(pdfOutPath+path);
document =new PdfDocument();
}
}
}
将pdf转成txt
package;
import com.spire.pdf.PdfDocument;
import com.spire.pdf.PdfPageBase;
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 读取所有拆分⽂件⽣成txt⽂件
* @author wangcc
* @createTime 2021年08⽉31⽇ 23:51:00
*/
public class ReadAllSplitFile {
public static String fileName ="C:\\Users\\wangchenchen\\Desktop\\boot-structure\\outFile\\outPDFByMore\\"; public static String outPath ="C:\\Users\\wangchenchen\\Desktop\\boot-structure\\outFile\\";
public static void main(String[] args){
List<File> fileList =readAllFile();
List<String> pdfFileNameList =new ArrayList<>();
for(File file:fileList){
pdfFileNameList.Name());
}
Collections.sort(pdfFileNameList,new Comparator<String>(){
@Override
@Override
public int compare(String o1, String o2){
int n1 =extractNumber(o1);
int n2 =extractNumber(o2);
return n1 - n2;
}
});
File file =new File(outPath);
ists()){
file.delete();
}
for(String s:pdfFileNameList){
try{
readFile(s);
}catch(IOException e){
e.printStackTrace();
}
}
}
public static List<File>readAllFile(){
String filePath ="C:\\Users\\wangchenchen\\Desktop\\boot-structure\\outFile\\outPDFByMore";
ArrayList<File> fileList =new ArrayList<>();
File file =new File(filePath);
File[] files = file.listFiles();
if(Objects.isNull(files)){
return null;
}
for(File f:files){
if(f.isFile()){
fileList.add(f);
}
}
return fileList;
}
/**
* @Param  orderStr 排序:asc,des,不区分⼤⼩写
**/
public static List<File>sortFileByName(List<File> fileList,final String orderStr){
if(!orderStr.equalsIgnoreCase("asc")&&!orderStr.equalsIgnoreCase("desc")){
return fileList;
}
File[] files = Array(new File[0]);
Arrays.sort(files,new Comparator<File>(){
@Override
public int compare(File o1, File o2){
int n1 =Name());
int n2 =Name());
if(orderStr ==null|| orderStr.length()<1|| orderStr.equalsIgnoreCase("asc")){
return n1 - n2;
}else{
//降序
return n2 - n1;
}
}
});
return new ArrayList<File>(Arrays.asList(files));
}
public static int extractNumber(String name){
int i;
try{
try{
String s = placeAll("[^\\d]","");
i = Integer.parseInt(s);
}catch(Exception e){
i =0;
}
return i;
}
public static void readFile(String path)throws IOException {
PdfDocument pdf =new PdfDocument();
pdf.loadFromFile(fileName+path);
PdfPageBase page;
StringBuilder sb =new StringBuilder();
Pattern pattern = Patternpile("(^(\\s*)第)(.{1,9})[章节卷集部篇回](\\s{1,10})(.{1,20})(\\s{1,10})");
Pattern pattern1 = Patternpile("(\\s{0,10})([0-9][0-9]?[0-9]?[0-9]?)");
//遍历PDF页⾯,获取每个页⾯的⽂本并添加到StringBuilder对象
for(int i =0;i < Pages().getCount();i++){
//System.out.println("循环遍历pdf页数:当前" + i + "页/" + Pages().getCount() + "页");
page = Pages().get(i);
int count =0;
String extractText =null;
BufferedReader br =new BufferedReader(new InputStreamReader(new actText(true).getBytes())));
while((extractText = br.readLine())!=null){
Matcher matcher = pattern.matcher(extractText);
Matcher matcher1 = pattern1.matcher(extractText);
/*末尾包含数字的*/
if(count !=0|| matcher.find()){
//System.out.println(extractText);
if(!extractText.equals("")&&!matcher1.find()){
String s = placeAll("\\s{5,9}"," ");
sb.append(s+"\n");
}
}
count++;
}
br.close();
}
FileWriter writer;
try{
//将StringBuilder对象中的⽂本写⼊到⽂本⽂件
writer =new FileWriter(outPath,true);
System.out.String());
writer.String());
writer.flush();
writer.close();
sb.delete(0,sb.length());
}catch(IOException e){
e.printStackTrace();
}
pdf.close();
}
}
识别pdf中的篇章节⽣成对应⽂本
去除⽔印