本篇内容介绍了“Java怎么实现Word/Pdf/TXT转html”的有关知识。在实际案例的操作过程中,不少人都会遇到这样的问题,接下来就让小编带领大家学习一下如何处理这些情况吧!希望大家仔细阅读,能够学有所成!
一:Java实现将word转换为html
1:引入依赖
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.xdocreport.document</artifactId>
    <version>1.0.5</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
    <version>1.0.5</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.12</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.12</version>
</dependency>
2:代码demo
1 package com.svse.controller;
2
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.core.IURIResolver;
import org.apache.poi.xwpf.converter.core.IXWPFConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

24
27 public class TestWordToHtml {
28
29 public static final String STORAGEPATH="C://works//files//";
30 public static final String IP="192.168.30.222";
31 public static final String PORT="8010";
32 public static void main(String[] args) throws IOException, TransformerException, ParserConfigurationException {
33 TestWordToHtml wt=new TestWordToHtml();
34 //wt.Word2003ToHtml("甲骨文考证.doc");
35 wt.Word2007ToHtml("甲骨文考证.docx");
36
37 }
38
39
45 public void Word2003ToHtml(String fileName) throws IOException, TransformerException, ParserConfigurationException {
46
47 final String imagepath = STORAGEPATH+"fileImage/";//解析时候如果doc文件中有图片 图片会保存在此路径
48 final String strRanString=getRandomNum();
49 String filepath =STORAGEPATH;
50 String htmlName =fileName.substring(0, fileName.indexOf("."))+ "2003.html";
51 final String file = filepath + fileName;
52 InputStream input = new FileInputStream(new File(file));
53 HWPFDocument wordDocument = new HWPFDocument(input);
54 WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
55 //设置图片存放的位置
56 wordToHtmlConverter.setPicturesManager(new PicturesManager() {
57 public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
58 File imgPath = new File(imagepath);
59 if(!imgPath.exists()){//图片目录不存在则创建
60 imgPath.mkdirs();
61 }
62
63 File file = new File(imagepath +strRanString+suggestedName);
64 try {
65 OutputStream os = new FileOutputStream(file);
66 os.write(content);
67 os.close();
68 } catch (FileNotFoundException e) {
69 e.printStackTrace();
70 } catch (IOException e) {
71 e.printStackTrace();
72 }
73
74 return "http://"+IP+":"+PORT+"//uploadFile/fileImage/"+strRanString+suggestedName;
75 // return imagepath +strRanString+suggestedName;
76 }
77 });
78
79 //解析word文档
80 wordToHtmlConverter.processDocument(wordDocument);
81 Document htmlDocument = wordToHtmlConverter.getDocument();
82
83 File htmlFile = new File(filepath +strRanString+htmlName);
84 OutputStream outStream = new FileOutputStream(htmlFile);
85
86
87 DOMSource domSource = new DOMSource(htmlDocument);
88 StreamResult streamResult = new StreamResult(outStream);
89
90 TransformerFactory factory = TransformerFactory.newInstance();
91 Transformer serializer = factory.newTransformer();
92 serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
93 serializer.setOutputProperty(OutputKeys.INDENT, "yes");
94 serializer.setOutputProperty(OutputKeys.METHOD, "html");
95
96 serializer.transform(domSource, streamResult);
97 outStream.close();
98
99 System.out.println("生成html文件路径:"+ "http://"+IP+":"+PORT+"//uploadFile/"+strRanString+htmlName);
100 }
101
102
106 public void Word2007ToHtml(String fileName) throws IOException {
107
108 final String strRanString=getRandomNum();
109
110 String filepath = STORAGEPATH+strRanString;
111 String htmlName =fileName.substring(0, fileName.indexOf("."))+ "2007.html";
112 File f = new File(STORAGEPATH+fileName);
113 if (!f.exists()) {
114 System.out.println("Sorry File does not Exists!");
115 } else {
116 if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {
117 try {
118 // 1) 加载word文档生成 XWPFDocument对象
119 InputStream in = new FileInputStream(f);
120 XWPFDocument document = new XWPFDocument(in);
121
122 // 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)
123 File imageFolderFile = new File(filepath);
124 XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));
125 options.setExtractor(new FileImageExtractor(imageFolderFile));
126 options.URIResolver(new IURIResolver() {
127 public String resolve(String uri) {
128 //http://192.168.30.222:8010//uploadFile/....
129 return "http://"+IP+":"+PORT+"//uploadFile/"+strRanString +"/"+ uri;
130 }
131 });
132
133 options.setIgnoreStylesIfUnused(false);
134 options.setFragment(true);
135
136 // 3) 将 XWPFDocument转换成XHTML
137 OutputStream out = new FileOutputStream(new File(filepath + htmlName));
138 IXWPFConverter<XHTMLOptions> converter = XHTMLConverter.getInstance();
139 converter.convert(document,out, options);
140 //XHTMLConverter.getInstance().convert(document, out, options);
141 System.out.println("html路径:"+"http://"+IP+":"+PORT+"//uploadFile/"+strRanString+htmlName);
142 } catch (Exception e) {
143 e.printStackTrace();
144 }
145
146 } else {
147 System.out.println("Enter only MS Office 2007+ files");
148 }
149 }
150 }
151
152
158 public static String getRandomNum(){
159 Date dt = new Date();
160 SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
161 String str=sdf.format(dt);
162 return str;
163 }
164
165 }
二:Java实现将Pdf转换为html
1: 引入依赖
<dependency>
    <groupId>net.sf.cssbox</groupId>
    <artifactId>pdf2dom</artifactId>
    <version>1.7</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.12</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox-tools</artifactId>
    <version>2.0.12</version>
</dependency>
16
2:代码Demo
1 public class PdfToHtml {
2
3
6 public void pdfToHtmlTest(String inPdfPath,String outputHtmlPath) {
7 // String outputPath = "C:\\works\\files\\ZSQ保密知识测试题库.html";
8 9 //try() 写在()里面会自动关闭流
10 try{
11 BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(outputHtmlPath)),"utf-8"));
12 //加载PDF文档
13 //PDDocument document = PDDocument.load(bytes);
14 PDDocument document = PDDocument.load(new File(inPdfPath));
15 PDFDomTree pdfDomTree = new PDFDomTree();
16 pdfDomTree.writeText(document,out);
17 } catch (Exception e) {
18 e.printStackTrace();
19 }
20 }
21
22 public static void main(String[] args) throws IOException {
23 PdfToHtml ph=new PdfToHtml();
24 String pdfPath="C:\\works\\files\\武研中心行政考勤制度.pdf";
25 String outputPath="C:\\works\\files\\武研中心行政考勤制度.html";
26 ph.pdfToHtmlTest(pdfPath,outputPath);
27 }
28
29 }
三:Java实现将TXT转换为html
1
6 public static void txtToHtml(String filePath, String htmlPosition) {
7 try {
8 //String encoding = "GBK";
9 File file = new File(filePath);
10 if (file.isFile() && file.exists()) { // 判断文件是否存在
11 InputStreamReader read = new InputStreamReader(new FileInputStream(file), "GBK");
12 // 考虑到编码格式
13 BufferedReader bufferedReader = new BufferedReader(read);
14 // 写文件
15 FileOutputStream fos = new FileOutputStream(new File(htmlPosition));
16 OutputStreamWriter osw = new OutputStreamWriter(fos, "GBK");
17 BufferedWriter bw = new BufferedWriter(osw);
18 String lineTxt = null;
19 while ((lineTxt = bufferedReader.readLine()) != null) {
20 bw.write("   "+lineTxt + "</br>");
21 }
22 bw.close();
23 osw.close();
24 fos.close();
25 read.close();
26 } else {
27 System.out.println("找不到指定的文件");
28 }
29 } catch (Exception e) {
30 System.out.println("读取文件内容出错");
31 e.printStackTrace();
32 }
33 }
“Java怎么实现Word/Pdf/TXT转html”的内容就介绍到这里了,感谢大家的阅读。如果想了解更多行业相关的知识可以关注编程网网站,小编将为大家输出更多高质量的实用文章!