引入相关jar包
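<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.12.1</version>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.83</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-core</artifactId>
    <version>7.7.2</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-queryparser</artifactId>
    <version>7.7.2</version>
</dependency>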
/**
 * Simple data holder for one product entry: its title, image and price.
 *
 * <p>All three values are kept as plain strings. Accessors and the
 * no-argument / all-argument constructors are supplied by the class-level
 * annotations (presumably Lombok; the imports are not visible here). The
 * field declaration order is significant because the all-argument
 * constructor takes its parameters in that order: title, img, price.
 */
@AllArgsConstructor
@NoArgsConstructor
@Data
public class Content {

    /** Product title text. */
    private String title;

    /** Product image reference (stored as a string). */
    private String img;

    /** Product price, kept as the raw text rather than a number. */
    private String price;
}
@Component
public class HtmlParseUtil {
public static void main(String[] args) throws IOException {
new HtmlParseUtil().parseJd("java").forEach(System.out::println);
}
public ArrayList parseJd(String keywords) throws IOException {
String url = "https://search.jd.com/Search?keyword="+keywords;
// 解析网页(Jsoup返回Document就是浏览器Document对象)
Document document = Jsoup.parse(new URL(url),30000);
// 所有你在js中可以使用的方法,这里都能用!
Element element = document.getElementById("J_goodsList");
// 获取所有的li元素
Elements elements = element.getElementsByTag("li");
ArrayList goodsList = new ArrayList();
for(Element el : elements){
String img = el.getElementsByTag("img")
.eq(0).attr("data-lazy-img");
String price = el.getElementsByClass("p-price").eq(0).text();
String title = el.getElementsByClass("p-name").eq(0).text();
Content content = new Content();
content.setTitle(title);
content.setImg(img);
content.setPrice(price);
goodsList.add(content);
}
return goodsList;
}
}
/**
 * Scrapes the products matching the keyword and writes them into the Lucene index.
 *
 * @param keywords the search keyword handed to {@code htmlParseUtil.parseJd}
 * @return {@code true} if at least one product was found and indexed,
 *         {@code false} if the search produced no products (the index is not opened then)
 * @throws IOException if scraping fails or the index cannot be opened or written
 */
public Boolean parseContent(String keywords) throws IOException {
    // 1. Collect the data.
    List<Content> contents = htmlParseUtil.parseJd(keywords);
    if (contents.isEmpty()) {
        return false;
    }

    // 2. Build one Lucene document per product before touching the index,
    //    so a failure while building documents leaves the index unopened.
    List<Document> documentList = new ArrayList<>(contents.size());
    for (Content content : contents) {
        Document document = new Document();
        // Each value is added as a stored text field.
        document.add(new TextField("title", content.getTitle(), Field.Store.YES));
        document.add(new TextField("img", content.getImg(), Field.Store.YES));
        document.add(new TextField("price", content.getPrice(), Field.Store.YES));
        documentList.add(document);
    }

    // 3-6. Open the analyzer, the index directory and the writer.
    //      StandardAnalyzer tokenizes English well; Chinese text is split one character per token.
    //      try-with-resources closes all three (writer first) even when adding a document
    //      fails, so the index write lock is always released. Closing the writer commits
    //      pending changes by default.
    try (Analyzer analyzer = new StandardAnalyzer();
         Directory dir = FSDirectory.open(Paths.get("E:\\luceneJdDir"));
         IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
        // 7. Write the documents into the index.
        for (Document doc : documentList) {
            indexWriter.addDocument(doc);
        }
    }
    return true;
}