该代码简单实现了以下功能:根据用户输入的URL地址下载对应的文档,并统计指定关键字在该文档中出现的次数。具体实现见下面的代码。
1.实现URL文档的拷贝
import java.util.Scanner;
import java.util.regex.Pattern;
import java.net.*;
import java.io.*;
import javax.swing.*;
import javax.swing.UIManager;
import java.awt.*;
import javax.swing.plaf.FontUIResource;
/**
 * Small Swing-driven tool: asks the user for a URL and a keyword, copies the
 * document behind the URL (with HTML tags stripped line by line) into a file
 * under {@code D:/}, and then hands the copy to {@link TextFileSearch} to
 * count how often the keyword occurs.
 */
public class TestURL {
    /** Keyword entered by the user; stays {@code null} until the keyword prompt has been answered. */
    static String getUserKeyWords = null;

    /** Directory that receives the downloaded copy. */
    private static final String OUTPUT_DIR = "D:/";

    /** File name used when no usable name can be derived from the URL. */
    private static final String DEFAULT_FILE_NAME = "newTest.txt";

    // The patterns are compiled once instead of once per processed line.
    /** Matches a complete {@code <script>...</script>} element. */
    private static final Pattern SCRIPT_BLOCK = Pattern.compile(
            "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>", Pattern.CASE_INSENSITIVE);

    /** Matches a complete {@code <style>...</style>} element. */
    private static final Pattern STYLE_BLOCK = Pattern.compile(
            "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>", Pattern.CASE_INSENSITIVE);

    /** Matches any single HTML tag. */
    private static final Pattern HTML_TAG = Pattern.compile("<[^>]+>", Pattern.CASE_INSENSITIVE);

    /** Matches a run of one or more spaces. */
    private static final Pattern SPACE_RUN = Pattern.compile("[ ]+");

    /** Matches a blank line together with its line terminator. */
    private static final Pattern BLANK_LINE = Pattern.compile("(?m)^\\s*$(\\n|\\r\\n)");

    /**
     * Program entry point: prompts for the URL and the keyword, copies the
     * document, and starts the keyword search on the copy.
     *
     * <p>The search is only started when the copy succeeded and a non-blank
     * keyword was entered; a failed copy is reported to the user instead of
     * being silently ignored.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        UIManager.put("JOptionPane.messageFont", new FontUIResource(new Font("宋体", Font.BOLD, 20)));

        String userUrl = JOptionPane.showInputDialog(null, "URL地址:\n", "输入URL地址", JOptionPane.PLAIN_MESSAGE);
        if (userUrl == null || userUrl.trim().isEmpty()) {
            // The dialog was cancelled or left empty: there is nothing to download.
            return;
        }

        getUserKeyWords = JOptionPane.showInputDialog(null, "关键字查询:\n", "关键字", JOptionPane.PLAIN_MESSAGE);

        File copyfile;
        try {
            URL url = new URL(userUrl.trim());
            copyfile = new File(OUTPUT_DIR, fileNameFor(url));
            copyUrlToFile(url, copyfile);
            // Only announce success when the copy really finished.
            System.out.println("拷贝完成!");
        } catch (IOException e) {
            System.err.println("拷贝失败: " + e);
            JOptionPane.showMessageDialog(null, "拷贝失败: " + e, "错误", JOptionPane.ERROR_MESSAGE);
            return;
        }

        if (getUserKeyWords == null || getUserKeyWords.trim().isEmpty()) {
            // Without a keyword the search would only fail its parameter validation.
            return;
        }
        TextFileSearch search = new TextFileSearch();
        search.SearchKeyword(copyfile, getUserKeyWords);
    }

    /**
     * Derives the name of the local copy from the last segment of the URL path.
     * The query string and fragment are not part of the path and are therefore
     * dropped; characters that are illegal in Windows file names are replaced.
     *
     * @param url the document URL
     * @return a file name, never empty
     */
    private static String fileNameFor(URL url) {
        String path = url.getPath();
        String name = path.substring(path.lastIndexOf('/') + 1);
        name = name.replaceAll("[\\\\/:*?\"<>|]", "_").trim();
        if (name.isEmpty() || name.equals(".") || name.equals("..")) {
            return DEFAULT_FILE_NAME;
        }
        return name;
    }

    /**
     * Reads the document behind {@code url} line by line as UTF-8, strips the
     * HTML from each line and writes the result to {@code target}.
     *
     * @param url    the document to download
     * @param target the file that receives the text
     * @throws IOException if the connection cannot be opened or reading/writing fails
     */
    private static void copyUrlToFile(URL url, File target) throws IOException {
        URLConnection connection = url.openConnection();
        // NOTE: FileWriter uses the platform default charset; this is kept on purpose
        // because TextFileSearch reads the copy back with FileReader (same default).
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"));
             BufferedWriter writer = new BufferedWriter(new FileWriter(target))) {
            String line;
            while ((line = reader.readLine()) != null) {
                writer.write(Html2Text(line));
                writer.newLine();
            }
        } finally {
            if (connection instanceof HttpURLConnection) {
                ((HttpURLConnection) connection).disconnect();
            }
        }
    }

    /**
     * Extracts plain text from a piece of HTML: removes script elements, style
     * elements and all remaining tags, collapses runs of spaces into a single
     * space and drops blank lines.
     *
     * <p>{@link #main} calls this once per line, so a script or style element
     * that spans several lines only loses its tags, not its content.
     *
     * @param inputString text that may contain HTML markup; {@code null} is treated as empty
     * @return the text without markup, never {@code null}
     */
    public static String Html2Text(String inputString) {
        if (inputString == null) {
            return "";
        }
        String text = SCRIPT_BLOCK.matcher(inputString).replaceAll("");
        text = STYLE_BLOCK.matcher(text).replaceAll("");
        text = HTML_TAG.matcher(text).replaceAll("");
        text = SPACE_RUN.matcher(text).replaceAll(" ");
        text = BLANK_LINE.matcher(text).replaceAll("");
        return text;
    }
}
2.实现关键词在文档的查询功能
import java.io.Closeable;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import javax.swing.UIManager;
import javax.swing.*;
import java.awt.*;
import javax.swing.plaf.FontUIResource;
/**
 * Counts how often a keyword occurs in a text file and reports the total to
 * the user in a message dialog.
 */
public class TextFileSearch {

    /**
     * Counts the non-overlapping occurrences of {@code keyword} in {@code file}
     * and, when at least one occurrence was found, shows the total in a dialog.
     * Nothing is shown when the keyword does not occur. An I/O error while
     * reading is printed to standard error and ends the search without a dialog.
     *
     * @param file    the text file to scan; must be an existing, readable regular file
     * @param keyword the text to look for; must not be {@code null} or blank
     * @throws NullPointerException if {@code file} is {@code null}, or {@code keyword} is {@code null} or blank
     * @throws RuntimeException     if {@code file} does not exist, is a directory, or cannot be read
     */
    public void SearchKeyword(File file, String keyword) {
        // Validate the parameters before any file is opened.
        verifyParam(file, keyword);

        int times = 0; // total number of occurrences across all lines
        // try-with-resources closes the reader on every path, including failures.
        try (LineNumberReader lineReader = new LineNumberReader(new FileReader(file))) {
            String readLine;
            while ((readLine = lineReader.readLine()) != null) {
                times += countOccurrences(readLine, keyword);
            }
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }

        if (times > 0) {
            UIManager.put("JOptionPane.messageFont", new FontUIResource(new Font("宋体", Font.BOLD, 20)));
            // Report the keyword that was actually searched, not a value held by another class.
            JOptionPane.showMessageDialog(null, "关键字" + "@" + keyword + "@" + "共有" + times + "个");
        }
    }

    /**
     * Counts the non-overlapping occurrences of {@code keyword} within one line.
     *
     * @param line    the line to scan
     * @param keyword the non-empty text to look for
     * @return the number of occurrences
     */
    private int countOccurrences(String line, String keyword) {
        int count = 0;
        int next = 0; // position at which the next lookup starts
        int index;
        while ((index = line.indexOf(keyword, next)) != -1) {
            // Continue right behind the match so that matches never overlap.
            next = index + keyword.length();
            count++;
        }
        return count;
    }

    /**
     * Validates the arguments of {@link #SearchKeyword(File, String)}.
     *
     * @param file    the file to check
     * @param keyword the keyword to check
     * @throws NullPointerException if {@code file} is {@code null}, or {@code keyword} is {@code null} or blank
     * @throws RuntimeException     if {@code file} does not exist, is a directory, or cannot be read
     */
    private void verifyParam(File file, String keyword) {
        if (file == null) {
            throw new NullPointerException("the file is null");
        }
        if (keyword == null || keyword.trim().equals("")) {
            throw new NullPointerException("the keyword is null or \"\" ");
        }
        if (!file.exists()) {
            throw new RuntimeException("the file is not exists");
        }
        // Must be a regular file, not a directory.
        if (file.isDirectory()) {
            throw new RuntimeException("the file is a directory,not a file");
        }
        // Must be readable.
        if (!file.canRead()) {
            throw new RuntimeException("the file can't read");
        }
    }
}
3.显示效果
URL地址获取效果图
关键字查询界面
查询后效果图
到此,这篇关于Java实现统计文档中关键字出现次数的文章就介绍到这了。更多相关Java关键字次数的内容,请搜索编程网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持编程网!