这是目前在做的一个项目中用到的 XML 文件读写实现。记录下来以备日后查阅,也供有需要的同学参考学习。
xml文件读写类:
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import org.lt.cj.config.entities.ConfigModel;
import org.lt.cj.config.entities.TMallConfigModel;
import org.lt.cj.core.Seed;
public class XMLConfigWriter {
/**
 * Builds the JDOM document that describes a Tmall crawl task.
 *
 * <p>The root element is {@code website} with {@code name} and {@code url} attributes.
 * Its children are written in this order: {@code taskName}, {@code seeds},
 * {@code fitUrls}, {@code workingThreads}, {@code pageEncoding}, {@code dwdPhoFlag},
 * {@code orien_lan}, {@code trans_lan}, {@code pageUrlRegs}, {@code pathElements}.
 *
 * @param missionConfig task configuration to serialise
 * @return the assembled document, or {@code null} when the configuration has no seeds
 * @throws MissionConfigException if {@code missionConfig} is {@code null}
 * @throws EnterUrlsException kept in the signature for existing callers; this
 *         implementation never throws it
 */
public Document buildUpMallDocument(TMallConfigModel missionConfig) throws MissionConfigException, EnterUrlsException {
    if (missionConfig == null) {
        throw new MissionConfigException();
    } else if (missionConfig.getSeeds().isEmpty()) {
        return null;
    }

    // Root element with the website name and address as attributes.
    Element rootElement = new Element("website");
    rootElement.setAttribute("name", missionConfig.getWebsiteName());
    rootElement.setAttribute("url", missionConfig.getWebsiteUrl());

    // Task name.
    rootElement.addContent(newTextElement("taskName", missionConfig.getTaskName()));

    // Seed list: each seed carries sortName, seedName and seedUrl, in that order.
    Element seeds = new Element("seeds");
    for (Seed seed : missionConfig.getSeeds()) {
        Element seedElement = new Element("seed");
        seedElement.addContent(newTextElement("sortName", seed.getSortName()));
        seedElement.addContent(newTextElement("seedName", seed.getSeedName()));
        seedElement.addContent(newTextElement("seedUrl", seed.getUrl()));
        seeds.addContent(seedElement);
    }
    rootElement.addContent(seeds);

    // Regular expressions for the URLs that should be followed.
    Element fitUrls = new Element("fitUrls");
    for (String fitUrlReg : missionConfig.getFitUrlRegs()) {
        fitUrls.addContent(newTextElement("fit_url", fitUrlReg));
    }
    rootElement.addContent(fitUrls);

    // Number of concurrent worker threads.
    rootElement.addContent(newTextElement("workingThreads", String.valueOf(missionConfig.getWorkingThreads())));
    // Page encoding.
    rootElement.addContent(newTextElement("pageEncoding", missionConfig.getPageEncoding()));
    // Flag controlling whether pictures are downloaded.
    rootElement.addContent(newTextElement("dwdPhoFlag", missionConfig.getDwdPhoFlag()));
    // Original and translated languages.
    rootElement.addContent(newTextElement("orien_lan", missionConfig.getOrigLanguage()));
    rootElement.addContent(newTextElement("trans_lan", missionConfig.getTranLanguage()));

    // Regular expressions for product pages whose content is extracted.
    // Each entry comes from getPageReg(), the same list that bounds the loop.
    Element pageUrlRegs = new Element("pageUrlRegs");
    for (String pageReg : missionConfig.getPageReg()) {
        pageUrlRegs.addContent(newTextElement("pageUrl", pageReg));
    }
    rootElement.addContent(pageUrlRegs);

    // Extraction rules: one child element per map key, one <path> per rule.
    Map<String, List<String>> entityReg = missionConfig.getEntityReg();
    Element pathElements = new Element("pathElements");
    for (Map.Entry<String, List<String>> entry : entityReg.entrySet()) {
        Element group = new Element(entry.getKey());
        for (String pathReg : entry.getValue()) {
            group.addContent(newTextElement("path", pathReg));
        }
        pathElements.addContent(group);
    }
    rootElement.addContent(pathElements);

    return new Document(rootElement);
}

/* Creates an element called name whose only content is the given text. */
private static Element newTextElement(String name, String text) {
    Element element = new Element(name);
    element.addContent(text);
    return element;
}
/**
 * Writes {@code document} to {@code filepath} as pretty-printed XML.
 *
 * <p>The file is created when it does not exist and overwritten when it does.
 * I/O failures are logged and not propagated to the caller.
 *
 * @param document the JDOM document to serialise
 * @param filepath path of the file to write
 */
public void createConfigFile(Document document, String filepath) {
    XMLOutputter xmlOutPutter = new XMLOutputter();
    xmlOutPutter.setFormat(Format.getPrettyFormat());
    try {
        // FileOutputStream creates a missing file itself, so no exists()/createNewFile()
        // pre-check is required.
        FileOutputStream fileOutputStream = new FileOutputStream(filepath);
        try {
            xmlOutPutter.output(document, fileOutputStream);
        } finally {
            // Always release the file handle, even when output() fails.
            fileOutputStream.close();
        }
    } catch (IOException e) {
        Logger.getLogger(XMLConfigWriter.class.getName())
                .log(Level.SEVERE, "Failed to write config file " + filepath, e);
    }
}
/**
 * Rebuilds the XML document for {@code configModel} and writes it to {@code filePath},
 * replacing whatever the file held before.
 *
 * <p>{@code configModel} is cast to {@link TMallConfigModel}. Nothing is written when
 * {@link #buildUpMallDocument} returns {@code null}. A {@code MissionConfigException}
 * or {@code EnterUrlsException} raised while building is logged at SEVERE level and
 * not rethrown.
 *
 * @param filePath    path of the configuration file to overwrite
 * @param configModel task configuration; must be a {@code TMallConfigModel}
 */
public void saveTask(String filePath, ConfigModel configModel) {
    try {
        Document rebuilt = buildUpMallDocument((TMallConfigModel) configModel);
        if (rebuilt == null) {
            // No document was produced, so leave the file alone.
            return;
        }
        createConfigFile(rebuilt, filePath);
    } catch (MissionConfigException configError) {
        Logger.getLogger(XMLConfigWriter.class.getName()).log(Level.SEVERE, null, configError);
    } catch (EnterUrlsException urlError) {
        Logger.getLogger(XMLConfigWriter.class.getName()).log(Level.SEVERE, null, urlError);
    }
}
/**
 * Reads a Tmall task configuration from the XML file at {@code filePath}.
 *
 * <p>The expected layout is the one produced by {@code buildUpMallDocument}, plus an
 * optional {@code extractHtml} element. A missing element leaves the corresponding
 * text property {@code null} or the corresponding list or map empty, rather than
 * failing. A missing or empty {@code workingThreads} element leaves that property
 * untouched.
 *
 * <p>Parse and I/O failures are logged at SEVERE level. The model populated so far,
 * possibly empty, is still returned in that case.
 *
 * @param filePath location of the XML configuration
 * @return the populated model; never {@code null}
 * @throws NumberFormatException if {@code workingThreads} is present but is not an integer
 */
public TMallConfigModel readMallDocument(String filePath) {
    TMallConfigModel model = new TMallConfigModel();
    SAXBuilder sb = new SAXBuilder();
    try {
        // NOTE(review): SAXBuilder.build(String) takes a system ID (URI). Confirm callers
        // only pass paths that survive that interpretation, or switch to build(File).
        Document doc = sb.build(filePath);
        Element root = doc.getRootElement();

        // Basic website and task information.
        model.setWebsiteName(root.getAttributeValue("name"));
        model.setWebsiteUrl(root.getAttributeValue("url"));
        model.setTaskName(root.getChildText("taskName"));

        // Entry seeds.
        List<Seed> seedList = new ArrayList<Seed>();
        Element seedsElement = root.getChild("seeds");
        if (seedsElement != null) {
            for (Object child : seedsElement.getChildren()) {
                Element seedElement = (Element) child;
                Seed seed = new Seed();
                seed.setSeedName(seedElement.getChildTextTrim("seedName"));
                seed.setUrl(seedElement.getChildTextTrim("seedUrl"));
                seed.setSortName(seedElement.getChildTextTrim("sortName"));
                // NOTE(review): an optional <parentSeed> child (seedName/seedUrl/sortName)
                // used to be parsed into a Seed that was then discarded. It is not read
                // here; attach it once the Seed API for a parent is confirmed.
                seedList.add(seed);
            }
        }
        model.setSeeds(seedList);

        // Page fragments to extract (optional element).
        model.setExtractHtmlReg(readChildTexts(root.getChild("extractHtml")));
        // URL patterns to follow.
        model.setFitUrlRegs(readChildTexts(root.getChild("fitUrls")));

        // Worker thread count.
        String workingCount = root.getChildTextTrim("workingThreads");
        if (workingCount != null && workingCount.length() > 0) {
            model.setWorkingThreads(Integer.valueOf(workingCount));
        }
        // Page encoding.
        model.setPageEncoding(root.getChildText("pageEncoding"));
        // Picture download flag.
        model.setDwdPhoFlag(root.getChildText("dwdPhoFlag"));
        // Original and translated languages.
        model.setOrigLanguage(root.getChildText("orien_lan"));
        model.setTranLanguage(root.getChildText("trans_lan"));
        // Product page URL patterns.
        model.setPageReg(readChildTexts(root.getChild("pageUrlRegs")));

        // Remaining extraction rules, keyed by the name of each child of pathElements.
        Map<String, List<String>> entityReg = new HashMap<String, List<String>>();
        Element pathElements = root.getChild("pathElements");
        if (pathElements != null) {
            for (Object child : pathElements.getChildren()) {
                Element group = (Element) child;
                entityReg.put(group.getName(), readChildTexts(group));
            }
        }
        model.setEntityReg(entityReg);
    } catch (JDOMException ex) {
        Logger.getLogger(XMLConfigWriter.class.getName()).log(Level.SEVERE, null, ex);
    } catch (IOException ex) {
        Logger.getLogger(XMLConfigWriter.class.getName()).log(Level.SEVERE, null, ex);
    }
    return model;
}

/* Collects the text of every direct child of parent in document order; empty when parent is null. */
private static List<String> readChildTexts(Element parent) {
    List<String> texts = new ArrayList<String>();
    if (parent != null) {
        for (Object child : parent.getChildren()) {
            texts.add(((Element) child).getText());
        }
    }
    return texts;
}
}
xml文件内容:
caiji_tmall_精品男装_T恤 精品男装/T恤 精品男装/T恤 http://item.tmall.com/item.htm?id=9351702393 div class="list item-view item-miniView" http://www\.tmall\.com/go/act/tmall/iwanttobuy\.php.* http://list\.tmall\.com/.* http://item\.tmall\.com/item\.htm.* 1 UTF-8 zh en http://www\.tmall\.com/go/act/tmall/iwanttobuy\.php.* div class="tb-box tshop-psm tshop-psm-bdetailtabl" id="J_Detail" div id="reviews" class="J_DetailSection" data-reviewApi div class="clearfix tb-header-nav" div class="nav" a href div id="attributes" class="attributes ul class="attributes-list li div class="tb-detail-bd tb-clear" div class="tb-gallery" div class="tb-booth tb-pic tb-s310" img id="J_ImgBooth" src ul class="mallCrumbs-nav" id="J_crumbs" li class="mallCrumbs-nav-item" div class="tb-detail-bd tb-clear" ul class="tb-meta" li class="tb-sold-out tb-clear" div class="shop-intro" div class="extend" li script div class="tb-detail-bd tb-clear" div class="tb-gallery" ul id="J_UlThumb" class="tb-thumb tb-clearfix" img src= div class="layout grid-s5m0 " div class="tb-detail-hd" a target="_blank" href= div class="tb-detail-bd tb-clear" ul class="tb-meta" li id="J_StrPriceModBox" class="tb-detail-price tb-clearfix"










