说明
在开发的系统中,高校信息的作用是给人选的教育经历打上标签,方便筛选优质人才。 传统一些的方式就是985/211,但是现在主流的方式是双一流。尽管如此,用985/211来筛选也存在很大程度上的合理性,所以从多个角度都进行了查询。
数据来源方面,使用了阳光高考网和中国教育在线两个网站。 阳光高考网:https://gaokao.chsi.com.cn/sch/search.do 中国教育在线:https://daxue.eol.cn/ 查询流程: 1 通过阳光高考网选择民办大学,将所有民办大学筛选出来 2 通过中国教育在线筛选出985和211院校 3 查询阳光高考网数据列表,逐个学校判断是否是民办或985/211,并打上标签
技术说明:开发语言选择的是Java,pom.xml中引入了jsoup用来发起网络请求并解析HTML
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.12.1</version>
</dependency>
高校表结构设计
直接将数据内容保存即可,单一表
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.GeneratedValue;
import javax.persistence.Id;
/**
 * University information, stored as a single table.
 */
@Entity
public class University {

    /** Primary key; the value is generated by the persistence provider. */
    @Id
    @GeneratedValue
    private Long zid;

    /** Creation time, kept as text (column length 19). */
    @Column(length = 19, nullable = false)
    private String createTime;

    /** Last update time, kept as text (column length 19); optional. */
    @Column(length = 19)
    private String updateTime;

    /** Deleted flag: 1 = yes, 0 = no. */
    @Column(nullable = false)
    private Integer isDelete;

    /** Name of the institution. */
    @Column(length = 300, nullable = false)
    private String name;

    /** Location of the institution. */
    @Column(length = 300, nullable = false)
    private String city;

    /** Authority the institution is affiliated with. */
    @Column(length = 300, nullable = false)
    private String owner;

    /** Education level offered. */
    @Column(length = 100, nullable = false)
    private String level;

    /** Feature tag such as "985|211", "211" or a private-university marker; optional. */
    @Column(length = 100)
    private String feature;

    /** First-class university flag: 1 = yes, 0 = no. */
    @Column(nullable = false)
    private Integer leadingUniversity;

    /** First-class disciplines flag: 1 = yes, 0 = no. */
    @Column(nullable = false)
    private Integer leadingDisciplines;

    /** Graduate school flag: 1 = has one, 0 = none. */
    @Column(nullable = false)
    private Integer institute;

    // Accessors follow the field declaration order.

    public Long getZid() {
        return this.zid;
    }

    public void setZid(Long zid) {
        this.zid = zid;
    }

    public String getCreateTime() {
        return this.createTime;
    }

    public void setCreateTime(String createTime) {
        this.createTime = createTime;
    }

    public String getUpdateTime() {
        return this.updateTime;
    }

    public void setUpdateTime(String updateTime) {
        this.updateTime = updateTime;
    }

    public Integer getIsDelete() {
        return this.isDelete;
    }

    public void setIsDelete(Integer isDelete) {
        this.isDelete = isDelete;
    }

    public String getName() {
        return this.name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getCity() {
        return this.city;
    }

    public void setCity(String city) {
        this.city = city;
    }

    public String getOwner() {
        return this.owner;
    }

    public void setOwner(String owner) {
        this.owner = owner;
    }

    public String getLevel() {
        return this.level;
    }

    public void setLevel(String level) {
        this.level = level;
    }

    public String getFeature() {
        return this.feature;
    }

    public void setFeature(String feature) {
        this.feature = feature;
    }

    public Integer getLeadingUniversity() {
        return this.leadingUniversity;
    }

    public void setLeadingUniversity(Integer leadingUniversity) {
        this.leadingUniversity = leadingUniversity;
    }

    public Integer getLeadingDisciplines() {
        return this.leadingDisciplines;
    }

    public void setLeadingDisciplines(Integer leadingDisciplines) {
        this.leadingDisciplines = leadingDisciplines;
    }

    public Integer getInstitute() {
        return this.institute;
    }

    public void setInstitute(Integer institute) {
        this.institute = institute;
    }
}
辅助方法
用来判断某个学校是否是985/211,之所以没直接对比学校名称,是考虑到如果某学校是985/211,那么分校也理应是985/211。
/**
 * Reports whether {@code name} contains any of the given school names as a substring.
 *
 * <p>Substring matching (rather than equality) is deliberate: a branch campus whose
 * name embeds the parent school's name is treated as belonging to the same list.
 *
 * <p>Null and empty entries are ignored, because {@code String.contains("")} is true
 * for every string and a single empty entry would otherwise match every school.
 *
 * @param name   school name to test; {@code null} never matches
 * @param school candidate school names; {@code null} is treated as an empty list
 * @return {@code true} if at least one non-empty entry occurs inside {@code name}
 */
private boolean schoolContain(String name, List<String> school) {
    if (name == null || school == null) {
        return false;
    }
    for (String item : school) {
        if (item != null && !item.isEmpty() && name.contains(item)) {
            return true;
        }
    }
    return false;
}
代码流程
获取985院校列表
// 985 schools: collect the names from the table on the eol.cn 985 page.
String url = "https://daxue.eol.cn/985.shtml";
Document doc = Jsoup.connect(url).get();
List<String> school985 = new ArrayList<>();
Elements eles = doc.getElementsByTag("tbody").first().getElementsByTag("tr");
for (Element row : eles) {
    Elements cells = row.getElementsByTag("td");
    // The school name is read from the third cell counted from the end of the row.
    String schoolName = cells.get(cells.size() - 3).text();
    // An empty entry would later match every school via String.contains(""), so skip it.
    if (!schoolName.isEmpty()) {
        school985.add(schoolName);
    }
}
获取211院校列表
// 211 schools: collect the names from the table on the eol.cn 211 page.
url = "https://daxue.eol.cn/211.shtml";
doc = Jsoup.connect(url).get();
List<String> school211 = new ArrayList<>();
eles = doc.getElementsByTag("tbody").first().getElementsByTag("tr");
for (Element row : eles) {
    Elements cells = row.getElementsByTag("td");
    // The school name is read from the third cell counted from the end of the row.
    String schoolName = cells.get(cells.size() - 3).text();
    // An empty entry would later match every school via String.contains(""), so skip it.
    if (!schoolName.isEmpty()) {
        school211.add(schoolName);
    }
}
获取民办大学列表
// Private universities: query the chsi.com.cn search with the yxjbz=2 filter.
url = "https://gaokao.chsi.com.cn/sch/search.do?searchType=1&yxmc=&ssdm=&yxls=&xlcc=&yxjbz=2";
doc = Jsoup.connect(url).get();
// The third-from-last <li> of the pager form is parsed as the number of pages to walk.
eles = doc.getElementById("PageForm").getElementsByTag("li");
Element ele = eles.get(eles.size() - 3);
int count = Integer.parseInt(ele.text());
List<String> minbanSchoolName = new ArrayList<>();
for (int page = 0; page < count; page++) {
    // Each page URL carries a start offset that advances by 20 per page.
    url = "https://gaokao.chsi.com.cn/sch/search--searchType-1,yxjbz-2,start-" + (page * 20) + ".dhtml";
    doc = Jsoup.connect(url).get();
    eles = doc.getElementsByClass("ch-table").first().getElementsByTag("tr");
    // Row 0 is skipped (it is not a data row); the first cell holds the school name.
    for (int row = 1, rowTotal = eles.size(); row < rowTotal; row++) {
        Elements cells = eles.get(row).getElementsByTag("td");
        minbanSchoolName.add(cells.get(0).text());
    }
    // Pause between page requests.
    Thread.sleep(2000);
}
查询高校列表并保存到数据库
// Walk the unfiltered university listing page by page and persist every row.
url = "https://gaokao.chsi.com.cn/sch/search.do";
doc = Jsoup.connect(url).get();
// The third-from-last <li> of the pager form is parsed as the number of pages to walk.
eles = doc.getElementById("PageForm").getElementsByTag("li");
ele = eles.get(eles.size() - 3);
count = Integer.parseInt(ele.text());
for (int page = 0; page < count; page++) {
    url = "https://gaokao.chsi.com.cn/sch/search--start-" + (page * 20) + ".dhtml";
    doc = Jsoup.connect(url).get();
    eles = doc.getElementsByClass("ch-table").first().getElementsByTag("tr");
    int rowTotal = eles.size();
    // Row 0 is skipped; data rows start at index 1.
    for (int row = 1; row < rowTotal; row++) {
        Elements cells = eles.get(row).getElementsByTag("td");
        University record = new University();
        record.setCity(cells.get(1).text());
        record.setCreateTime(Dates.now());
        // A non-blank cell means the flag is set (1); a blank cell means 0.
        int firstClassUniversity = StringUtil.isBlank(cells.get(4).text()) ? 0 : 1;
        record.setLeadingUniversity(firstClassUniversity);
        int firstClassDisciplines = StringUtil.isBlank(cells.get(5).text()) ? 0 : 1;
        record.setLeadingDisciplines(firstClassDisciplines);
        String schoolName = cells.get(0).text();
        // Private-school membership wins; otherwise 985 is checked before 211.
        if (minbanSchoolName.contains(schoolName)) {
            record.setFeature("民办");
        } else if (schoolContain(schoolName, school985)) {
            record.setFeature("985|211");
        } else if (schoolContain(schoolName, school211)) {
            record.setFeature("211");
        }
        int hasGraduateSchool = StringUtil.isBlank(cells.get(6).text()) ? 0 : 1;
        record.setInstitute(hasGraduateSchool);
        record.setIsDelete(0);
        record.setLevel(cells.get(3).text());
        record.setName(schoolName);
        record.setOwner(cells.get(2).text());
        schoolRepository.save(record);
    }
    // Pause between page requests.
    Thread.sleep(2000);
}
完整测试类
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import com.lootaa.xcl.basedata.dao.UniversityRepository;
import com.lootaa.xcl.basedata.db.University;
import com.lootaa.xcl.basedata.util.Dates;
/**
 * Integration test that rebuilds the university table from two public websites:
 * the 985 and 211 lists come from daxue.eol.cn, while the private-school list and
 * the full university listing come from gaokao.chsi.com.cn.
 */
@SpringBootTest
class XclBasedataApplicationSchoolTests {

    /** Step of the {@code start-N} offset in the paged listing URLs (presumably the rows per page). */
    private static final int PAGE_STEP = 20;

    /** Pause after each listing page request, in milliseconds. */
    private static final long PAGE_PAUSE_MILLIS = 2000L;

    @Autowired UniversityRepository schoolRepository;

    /**
     * Scrapes all sources, then replaces the content of the university table.
     *
     * <p>The table is cleared only after scraping has finished, so a network or
     * parsing failure part-way through does not leave the table empty or partial.
     *
     * @throws Exception if a page cannot be fetched or parsed, or the thread is interrupted
     */
    @Test
    void loadSchool() throws Exception {
        List<String> school985 = fetchEolSchoolNames("https://daxue.eol.cn/985.shtml");
        List<String> school211 = fetchEolSchoolNames("https://daxue.eol.cn/211.shtml");
        List<String> minbanSchoolName = fetchPrivateSchoolNames();
        List<University> schools = fetchUniversities(school985, school211, minbanSchoolName);

        schoolRepository.deleteAll();
        for (University school : schools) {
            schoolRepository.save(school);
        }
    }

    /** Reads the school names from the first table body of an eol.cn list page (985 or 211). */
    private List<String> fetchEolSchoolNames(String url) throws java.io.IOException {
        Document doc = Jsoup.connect(url).get();
        Elements rows = doc.getElementsByTag("tbody").first().getElementsByTag("tr");
        List<String> names = new ArrayList<>();
        for (Element row : rows) {
            Elements cells = row.getElementsByTag("td");
            // The school name is read from the third cell counted from the end of the row.
            String schoolName = cells.get(cells.size() - 3).text();
            // An empty entry would match every school in schoolContain, so it is dropped here.
            if (!schoolName.isEmpty()) {
                names.add(schoolName);
            }
        }
        return names;
    }

    /** Collects the names of all private universities from the filtered chsi.com.cn listing. */
    private List<String> fetchPrivateSchoolNames() throws java.io.IOException, InterruptedException {
        Document firstPage = Jsoup.connect(
                "https://gaokao.chsi.com.cn/sch/search.do?searchType=1&yxmc=&ssdm=&yxls=&xlcc=&yxjbz=2").get();
        int pageCount = readPageCount(firstPage);
        List<String> names = new ArrayList<>();
        for (int page = 0; page < pageCount; page++) {
            Elements rows = fetchTableRows("https://gaokao.chsi.com.cn/sch/search--searchType-1,yxjbz-2,start-"
                    + (page * PAGE_STEP) + ".dhtml");
            // Row 0 is skipped; data rows start at index 1.
            for (int r = 1, rowTotal = rows.size(); r < rowTotal; r++) {
                names.add(rows.get(r).getElementsByTag("td").get(0).text());
            }
            Thread.sleep(PAGE_PAUSE_MILLIS);
        }
        return names;
    }

    /** Walks the unfiltered chsi.com.cn listing and converts every data row into a {@link University}. */
    private List<University> fetchUniversities(List<String> school985, List<String> school211,
            List<String> minbanSchoolName) throws java.io.IOException, InterruptedException {
        Document firstPage = Jsoup.connect("https://gaokao.chsi.com.cn/sch/search.do").get();
        int pageCount = readPageCount(firstPage);
        List<University> schools = new ArrayList<>();
        for (int page = 0; page < pageCount; page++) {
            Elements rows = fetchTableRows(
                    "https://gaokao.chsi.com.cn/sch/search--start-" + (page * PAGE_STEP) + ".dhtml");
            // Row 0 is skipped; data rows start at index 1.
            for (int r = 1, rowTotal = rows.size(); r < rowTotal; r++) {
                Elements cells = rows.get(r).getElementsByTag("td");
                schools.add(toUniversity(cells, school985, school211, minbanSchoolName));
            }
            Thread.sleep(PAGE_PAUSE_MILLIS);
        }
        return schools;
    }

    /** Parses the third-from-last {@code <li>} of the pager form as the number of pages to walk. */
    private int readPageCount(Document doc) {
        Elements pagerItems = doc.getElementById("PageForm").getElementsByTag("li");
        return Integer.parseInt(pagerItems.get(pagerItems.size() - 3).text());
    }

    /** Fetches a listing page and returns the rows of its first {@code ch-table} element. */
    private Elements fetchTableRows(String url) throws java.io.IOException {
        Document doc = Jsoup.connect(url).get();
        return doc.getElementsByClass("ch-table").first().getElementsByTag("tr");
    }

    /**
     * Builds a {@link University} from one listing row.
     *
     * <p>Cells are read by position: 0 name, 1 city, 2 owner, 3 level,
     * 4 first-class university, 5 first-class disciplines, 6 graduate school.
     */
    private University toUniversity(Elements cells, List<String> school985, List<String> school211,
            List<String> minbanSchoolName) {
        String schoolName = cells.get(0).text();
        University school = new University();
        school.setName(schoolName);
        school.setCity(cells.get(1).text());
        school.setOwner(cells.get(2).text());
        school.setLevel(cells.get(3).text());
        school.setLeadingUniversity(flagOf(cells.get(4).text()));
        school.setLeadingDisciplines(flagOf(cells.get(5).text()));
        school.setInstitute(flagOf(cells.get(6).text()));
        school.setCreateTime(Dates.now());
        school.setIsDelete(0);
        // Private-school membership wins; otherwise 985 is checked before 211.
        if (minbanSchoolName.contains(schoolName)) {
            school.setFeature("民办");
        } else if (schoolContain(schoolName, school985)) {
            school.setFeature("985|211");
        } else if (schoolContain(schoolName, school211)) {
            school.setFeature("211");
        }
        return school;
    }

    /** Maps a table cell's text to a flag: blank means 0, anything else means 1. */
    private int flagOf(String cellText) {
        return StringUtil.isBlank(cellText) ? 0 : 1;
    }

    /**
     * Reports whether {@code name} contains any of the given school names as a substring.
     *
     * <p>Substring matching is deliberate: a branch campus whose name embeds the parent
     * school's name is treated as belonging to the same list. Null and empty entries are
     * ignored because {@code String.contains("")} is true for every string.
     *
     * @param name   school name to test; {@code null} never matches
     * @param school candidate school names; {@code null} is treated as an empty list
     * @return {@code true} if at least one non-empty entry occurs inside {@code name}
     */
    private boolean schoolContain(String name, List<String> school) {
        if (name == null || school == null) {
            return false;
        }
        for (String item : school) {
            if (item != null && !item.isEmpty() && name.contains(item)) {
                return true;
            }
        }
        return false;
    }
}
保存的数据