package com.ssm.service.task;
import java.awt.Image;
import java.awt.image.RenderedImage;
import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.imageio.ImageIO;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Service;
import com.ssm.mapper.NewsMapper;
import com.ssm.po.News;
import com.ssm.utils.EncoderUtil;
import com.ssm.utils.FileUtil;
import com.ssm.utils.HttpUtil;
/**
 * Scheduled scraper that collects news from the SCU computer-science college
 * site and the "Dragon Star" (ds2016) site, writes each article body to a
 * static HTML file, and replaces the rows held by {@link NewsMapper} with the
 * freshly scraped items.
 */
@Service("spider")
public class Spider {
    /** Prefix of the paginated news index; the page number and ".htm" are appended. */
    private static final String NEWS_INDEX_PREFIX = "http://cs.scu.edu.cn/cs/xyxw/H9501index_";
    /** Prefix joined with the captured link suffix to form an article URL. */
    private static final String NEWS_DETAIL_PREFIX = "http://cs.scu.edu.cn/cs/xyxw/webinfo";
    /** Host prepended to every {@code src="} attribute inside a scraped article body. */
    private static final String NEWS_HOST = "http://cs.scu.edu.cn";
    /** Base URL of the Dragon Star site. */
    private static final String DRAGON_SITE_URL = "http://cbd2016.scu.edu.cn/ds2016/";

    // NOTE(review): the host embedded in the two anchor patterns below looks like a
    // proxy-rewritten address rather than the real site - confirm that these
    // patterns still match the live pages.
    private static final Pattern DRAGON_TITLE_PATTERN = Pattern
            .compile("<A href=http://222.178.203.72:19005/whst/63/_bac1.05zrbtzdctzbm//ds2016 target=_blank>(.+?)</A>");
    private static final Pattern DETAIL_LINK_PATTERN = Pattern
            .compile("<A href=http://222.178.203.72:19005/whst/63/=cnvmknZczbrcmzmds//cs/xyxw/webinfo(.+?) target=_blank>");

    private static final Pattern DRAGON_DATE_PATTERN = Pattern
            .compile("<DIV align=right><FONT size=2>.(.+?).</FONT>");
    private static final Pattern DRAGON_CONTENT_PATTERN = Pattern
            .compile("<span style=\"font-size: 18px;\"><span style=\"font-weight: bold;\">.+?</span><br>"
                    + "</p><p style=\"margin-top: 0px; margin-bottom: 0px; word-spacing:"
                    + " 0px; text-align: justify;\">.+?<span style=\"font-size: 18px; font-family:.+?</span>"
                    + "</p><p style=\"margin-top: 0px; margin-bottom: 0px; word-spacing: 0px; text-align: justify;\">");
    private static final Pattern DRAGON_PIC_PATTERN = Pattern.compile("images/slide.+?.jpg");
    private static final Pattern NEWS_TITLE_PATTERN = Pattern.compile("<DIV align=center> (.+?)</DIV>");
    private static final Pattern NEWS_DATE_PATTERN = Pattern
            .compile("</SPAN> ([0-9].+?)<SPAN class=hangjc "
                    + "style=\"LINE-HEIGHT: 30px\" valign=\"bottom\">");

    /** Opening markup wrapped around every saved article body. */
    private static final String HTML_PAGE_HEAD = "<html><head> <base target=\"_blank\" /> <meta"
            + " http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"/>"
            + "</head><body>";
    /** Closing markup wrapped around every saved article body. */
    private static final String HTML_PAGE_TAIL = "</body></html>";

    @Autowired
    NewsMapper newsMapper;

    /**
     * Scrapes both sources and replaces the stored news with the result.
     * Scheduled at a fixed rate of 1000 * 1000 ms (roughly every 16.7 minutes).
     *
     * <p>Scraping happens before anything is deleted, and the existing rows are
     * kept untouched when nothing at all could be scraped, so a failed crawl
     * does not wipe the table.
     *
     * @throws Exception if scraping or persistence fails
     */
    @Scheduled(fixedRate = 1000 * 1000)
    public void getNotices() throws Exception {
        List<News> newsList = getComputerNews();
        News news = getDragonNews();
        boolean hasDragonNews = news.getTitle() != null;

        if (!hasDragonNews && newsList.isEmpty()) {
            System.out.println("No news scraped; keeping the existing records");
            return;
        }

        newsMapper.deleteAllNews();
        if (hasDragonNews) {
            newsMapper.insertNews(news);
        }
        // Skip the batch insert for an empty list instead of handing the mapper nothing to insert.
        if (!newsList.isEmpty()) {
            newsMapper.insertNewsList(newsList);
        }
    }

    /**
     * Builds the single Dragon Star news item.
     *
     * <p>Title and date are taken from the first college news index page; the
     * body and picture come from the Dragon Star site. Fields whose pattern
     * does not match are simply left unset, so callers should check
     * {@code getTitle()} for {@code null} before using the item.
     *
     * @return a news item with address and academy id always set; never {@code null}
     * @throws IllegalStateException if content was found but the output
     *         directory cannot be derived from the {@code b2cweb.root} system property
     */
    public News getDragonNews() {
        News news = new News();
        int fileCount = 0;

        // Title and date come from the college news index.
        String indexHtml = fetch("http://cs.scu.edu.cn/cs/xyxw/H9501index_1.htm", "GBK");
        Matcher matcher = DRAGON_TITLE_PATTERN.matcher(indexHtml);
        if (matcher.find()) {
            news.setTitle(matcher.group(1));
        }
        matcher = DRAGON_DATE_PATTERN.matcher(indexHtml);
        if (matcher.find()) {
            Date date = parseDate(matcher.group(1), "yyyy-MM-dd");
            if (date != null) {
                news.setTime(date);
            }
        }

        // Body and picture come from the Dragon Star site itself.
        String siteHtml = fetch(DRAGON_SITE_URL, "UTF-8");
        matcher = DRAGON_CONTENT_PATTERN.matcher(siteHtml);
        if (matcher.find()) {
            System.out.println(matcher.group(0));
            // Make the slide image absolute. The image path must be kept after the
            // base URL; a literal replace is used so the dots are not regex wildcards.
            String contentStr = matcher.group(0).replace(
                    "images/slide10.jpg", DRAGON_SITE_URL + "images/slide10.jpg");

            Matcher picMatcher = DRAGON_PIC_PATTERN.matcher(siteHtml);
            if (picMatcher.find()) {
                news.setPic(DRAGON_SITE_URL + picMatcher.group(0));
            }
            news.setContent(saveContentPage(contentStr, fileCount));
        }

        news.setAddress(DRAGON_SITE_URL);
        news.setAcademyId(1);
        return news;
    }

    /**
     * Scrapes every article linked from the paginated college news index.
     *
     * <p>Each article body (the element with id {@code BodyLabel}) is saved as
     * a numbered HTML file, starting at 1. Articles without that element are
     * skipped and do not consume a file number.
     *
     * @return the scraped items in index order; possibly empty, never {@code null}
     * @throws IllegalStateException if the output directory cannot be derived
     *         from the {@code b2cweb.root} system property
     */
    public List<News> getComputerNews() {
        List<String> links = collectDetailLinks();

        List<News> newsList = new ArrayList<News>();
        int fileCount = 1;
        for (String link : links) {
            String htmlStr = fetch(link, "GBK");
            htmlStr = EncoderUtil.getUTF8StringFromGBKString(htmlStr);

            Document doc = Jsoup.parse(htmlStr);
            Element contentEle = doc.getElementById("BodyLabel");
            if (contentEle == null) {
                // Without a body there is nothing to publish; skip instead of aborting the whole crawl.
                System.out.println("No BodyLabel element found, skipping: " + link);
                continue;
            }

            News news = new News();

            Matcher matcher = NEWS_TITLE_PATTERN.matcher(htmlStr);
            if (matcher.find()) {
                news.setTitle(matcher.group(1));
            }

            matcher = NEWS_DATE_PATTERN.matcher(htmlStr);
            if (matcher.find()) {
                Date date = parseDate(matcher.group(1), "yyyy-MM-dd HH:mm");
                if (date != null) {
                    news.setTime(date);
                }
            }

            // Prefix image sources with the site host so they resolve from the saved copy.
            String contentStr = contentEle.toString().replace("src=\"", "src=\"" + NEWS_HOST);
            news.setContent(saveContentPage(contentStr, fileCount));

            // The last image in the body becomes the item's picture; its src is
            // stored exactly as it appears in the page.
            Elements images = contentEle.getElementsByTag("img");
            if (!images.isEmpty()) {
                news.setPic(images.get(images.size() - 1).attr("src"));
            }

            news.setAcademyId(1);
            news.setAddress(link);
            newsList.add(news);
            fileCount++;
        }
        System.out.println(links.size());
        System.out.println(newsList.size());
        return newsList;
    }

    /**
     * Walks the index pages from page 1 upwards and collects every article
     * URL, stopping at the first page that contains no article link.
     */
    private List<String> collectDetailLinks() {
        List<String> links = new ArrayList<String>();
        int page = 1;
        boolean foundOnPage;
        do {
            String indexHtml = fetch(NEWS_INDEX_PREFIX + page + ".htm", "GBK");
            Matcher matcher = DETAIL_LINK_PATTERN.matcher(indexHtml);
            foundOnPage = false;
            // Every match is consumed inside this loop, so the first link of a page is kept too.
            while (matcher.find()) {
                foundOnPage = true;
                links.add(NEWS_DETAIL_PREFIX + matcher.group(1));
                System.out.println("text" + matcher.group(1));
            }
            page++;
        } while (foundOnPage);
        return links;
    }

    /**
     * Downloads a page, mapping a {@code null} response to an empty string.
     * How {@code HttpUtil.sendGet} reports failures is not visible here, so
     * {@code null} is handled defensively.
     */
    private static String fetch(String url, String charset) {
        String body = HttpUtil.sendGet(url, charset);
        return body == null ? "" : body;
    }

    /**
     * Parses {@code text} with the given {@link SimpleDateFormat} pattern and
     * prints the result; returns {@code null} (after reporting the problem)
     * when the text does not match.
     */
    private static Date parseDate(String text, String datePattern) {
        try {
            Date date = new SimpleDateFormat(datePattern).parse(text);
            System.out.println(date);
            return date;
        } catch (ParseException e) {
            System.out.println("日期格式不正确");
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Wraps an article body in a minimal UTF-8 HTML page, writes it to
     * {@code <root>SCU_News_Notice/WebContent/html/computer/news/<n>.html} and
     * returns the URL under which that file is served. {@code <root>} is the
     * {@code b2cweb.root} system property cut off at its first '.'.
     *
     * @throws IllegalStateException if {@code b2cweb.root} is unset or contains no '.'
     */
    private static String saveContentPage(String bodyHtml, int fileNumber) {
        String root = System.getProperty("b2cweb.root");
        if (root == null) {
            throw new IllegalStateException("System property b2cweb.root is not set");
        }
        root = root.replace("\\", "/");
        int dot = root.indexOf(".");
        if (dot < 0) {
            throw new IllegalStateException(
                    "Cannot derive the output directory from b2cweb.root: " + root);
        }
        String filePath = root.substring(0, dot)
                + "SCU_News_Notice/WebContent/html/computer/news/"
                + fileNumber + ".html";
        FileUtil.writeIntoFile(HTML_PAGE_HEAD + bodyHtml + HTML_PAGE_TAIL, filePath, false);
        return "http://localhost:8080/SCU_News_Notice/html/computer/news/"
                + fileNumber + ".html";
    }
}