php实现将HTML页面转换成word并且保存的方法
2018-09-07 17:45
本文实例讲述了php实现将HTML页面转换成word并且保存的方法。分享给大家供大家参考,具体如下:
这里使用到一个PHP的工具,叫:PHPWord。
生成Word的原理是,将一堆规定好了的xml压缩成一个zip包,并且把后缀名改成doc或者docx即可。
所以使用PHPWord,需要你的PHP环境安装zip.dll压缩扩展,我写了一个demo.
功能说明:
20150507 — HTML中的<p>标签和<ol>列表标签的获取
20150508 — 新增获取文章中的图片功能
20150509 — 新增行间距,并且过滤一下错误图片
20150514 — 新增表格处理,并且将代码改成面向对象
20150519 — 新增GD库处理网络图片
require_once 'PHPWord.php';
require_once 'SimpleHtmlDom.class.php';

/**
 * Fetches an HTML page, extracts its paragraphs, lists, images and tables,
 * and writes them into a Word 2007 (.docx) document that is then sent to the
 * client as a download.
 *
 * Constructing the object performs the whole pipeline: fetch, parse, download
 * images, build the document, save it and stream it.
 */
class Word
{
    private $url;

    /**
     * Parsed content. Integer keys hold one entry per leaf element
     * (array with a 'tag' key); the string key 'table' holds table rows.
     */
    private $LinetextArr = array();

    public $CurrentDir;

    /** Scheme and host of the source page, used to resolve host-less image URLs. */
    public $host;

    /** Directory into which downloaded images are written. */
    public $ImgDir;

    /** Errors collected while processing (entries come from error_get_last()). */
    public $error = array();

    public $filename = null;

    /** Selector list handed to simple_html_dom::find(). */
    public $Allowtag = 'p,ol,ul,table';

    /** Statistics */
    public $DownImg = 0;
    public $expendTime = 0;
    public $HttpRequestTime = 0;
    public $ContentLen = 0;
    public $HttpRequestArr = array();
    public $expendmemory = 0;

    /**
     * Runs the complete conversion for the given page.
     *
     * @param string $url Address of the HTML page to convert.
     */
    public function __construct($url)
    {
        $startTime = $this->_Time();
        $startMemory = $this->_memory();

        $this->url = $url;
        $UrlArr = parse_url($this->url);
        // parse_url() may return false or omit components; fall back instead of raising notices.
        $scheme = isset($UrlArr['scheme']) ? $UrlArr['scheme'] : 'http';
        $host = isset($UrlArr['host']) ? $UrlArr['host'] : '';
        $this->host = $scheme . '://' . $host;

        $this->CurrentDir = getcwd();
        $this->LinetextArr['table'] = array();

        $html = new simple_html_dom($this->url);
        $this->HttpRequestArr[] = $this->url;
        $this->HttpRequestTime++;

        foreach ($html->find($this->Allowtag) as $value) {
            if ($value->tag == 'table') {
                $this->ParseTable($value, 0, $this->LinetextArr['table']);
            } else {
                $this->AnalysisHtmlDom($value);
            }
            // Only record real errors; error_get_last() is null when nothing went wrong.
            $lastError = error_get_last();
            if ($lastError !== null) {
                $this->error[] = $lastError;
            }
        }

        $endTime = $this->_Time();
        $endMemory = $this->_memory();
        $this->expendTime = round(($endTime - $startTime), 2); // seconds
        $this->expendmemory = round(($endMemory - $startMemory) / 1000, 2); // units of 1000 bytes

        $this->CreateWordDom();
    }

    /**
     * @return float Current Unix timestamp in seconds, with microsecond precision.
     */
    private function _Time()
    {
        return microtime(true);
    }

    /**
     * @return int Memory currently allocated to the script, in bytes.
     */
    private function _memory()
    {
        return memory_get_usage();
    }

    /**
     * Collects the cells of a table, descending through wrapper elements
     * (table/tbody/thead/tfoot/tr) until cell level is reached.
     *
     * @param object $value DOM node to inspect.
     * @param int    $i     Row index the cells of this node are stored under.
     * @param array  $Arr   Receives the rows; passed by reference so the
     *                      recursion can fill the caller's array.
     */
    private function ParseTable($value, $i, &$Arr)
    {
        $first = $value->firstChild();
        $wrappers = array('table', 'tbody', 'thead', 'tfoot', 'tr');

        if ($first && in_array($first->tag, $wrappers, true)) {
            foreach ($value->children as $child) {
                // Post-increment on purpose: each sibling gets the next row index.
                $this->ParseTable($child, $i++, $Arr);
            }
            return;
        }

        foreach ($value->children as $cell) {
            $cellFirst = $cell->firstChild();
            // Cells whose first child is a nested table are skipped.
            if (!$cellFirst || $cellFirst->tag != 'table') {
                $Arr[$i][] = array('tag' => $cell->tag, 'text' => trim($cell->plaintext));
            }
        }
    }

    /**
     * Walks an element down to its leaves and records one entry per leaf:
     * links, images (downloaded locally) or plain text.
     *
     * @param object $value DOM node to analyse.
     */
    private function AnalysisHtmlDom($value)
    {
        if ($value->has_child()) {
            foreach ($value->children as $child) {
                $this->AnalysisHtmlDom($child);
            }
            return;
        }

        $tmp = array();
        if ($value->tag == 'a') {
            $tmp = array('tag' => $value->tag, 'href' => $value->href, 'text' => $value->innertext);
        } else if ($value->tag == 'img') {
            $src = $this->unescape($value->src);
            $UrlArr = parse_url($src);
            if (!isset($UrlArr['host'])) {
                // Host-less image URL: prefix it with the page's scheme and host.
                $src = $this->host . $value->src;
                $UrlArr = parse_url($src);
            }
            // Download the remote image; false means unsupported type or failure.
            $src = $this->getImageFromNet($src, $UrlArr);
            if ($src) {
                $imgsArr = $this->GD($src);
                if ($imgsArr) {
                    $tmp = array(
                        'tag' => $value->tag,
                        'src' => $src,
                        'text' => $value->alt,
                        'width' => $imgsArr['width'],
                        'height' => $imgsArr['height'],
                    );
                }
            }
        } else {
            $tmp = array('tag' => $value->tag, 'text' => strip_tags($value->innertext));
        }

        // Do not store empty placeholders for images that could not be used.
        if (!empty($tmp)) {
            $this->LinetextArr[] = $tmp;
        }
    }

    /**
     * Reads the dimensions of a local image and halves them when either
     * side exceeds 800 pixels.
     *
     * @param string $src Path of the image file.
     * @return array|false array('width' => ..., 'height' => ...) or false
     *                     when the file cannot be read as an image.
     */
    private function GD($src)
    {
        $info = getimagesize($src);
        if ($info === false) {
            $this->error[] = error_get_last();
            return false;
        }

        $width = $info[0];
        $height = $info[1];
        if ($width > 800 || $height > 800) {
            $width = $width / 2;
            $height = $height / 2;
        }

        return array('width' => $width, 'height' => $height);
    }

    /**
     * Decodes %uXXXX, &#xXXXX; and &#NNNN; escapes (after URL-decoding)
     * back into UTF-8 characters.
     *
     * @param string $str Escaped string.
     * @return string Decoded string.
     */
    public function unescape($str)
    {
        $str = rawurldecode($str);
        // Split into escape tokens and single characters (ungreedy).
        preg_match_all('/(?:%u.{4})|&#x.{4};|&#\d+;|.+/U', $str, $r);
        $ar = $r[0];
        foreach ($ar as $k => $v) {
            if (substr($v, 0, 2) == '%u') {
                $ar[$k] = iconv('UCS-2BE', 'UTF-8', pack('H4', substr($v, -4)));
            } elseif (substr($v, 0, 3) == '&#x') {
                $ar[$k] = iconv('UCS-2BE', 'UTF-8', pack('H4', substr($v, 3, -1)));
            } elseif (substr($v, 0, 2) == '&#') {
                $ar[$k] = iconv('UCS-2BE', 'UTF-8', pack('n', substr($v, 2, -1)));
            }
        }
        return join('', $ar);
    }

    /**
     * Downloads a remote image into "<CurrentDir>/<host>/".
     *
     * @param string      $Src    URL of the image.
     * @param array|false $UrlArr Result of parse_url($Src).
     * @return string|false Path "<host>/<md5 of URL path>.<ext>" relative to
     *                      CurrentDir, or false when the type is unsupported
     *                      or the download/write failed.
     */
    private function getImageFromNet($Src, $UrlArr)
    {
        if (!isset($UrlArr['path'], $UrlArr['host'])) {
            return false;
        }

        // Take the real (last) extension so names like "a.b.jpg" are handled.
        $ext = strtolower(pathinfo($UrlArr['path'], PATHINFO_EXTENSION));
        $this->ImgDir = $this->CurrentDir . '/' . $UrlArr['host'];
        $_supportedImageTypes = array('jpg', 'jpeg', 'gif', 'png', 'bmp', 'tif', 'tiff');
        if (!in_array($ext, $_supportedImageTypes, true)) {
            return false;
        }

        $file = file_get_contents($Src);
        $this->HttpRequestArr[] = $Src;
        $this->HttpRequestTime++;
        if ($file === false) {
            $this->error[] = error_get_last();
            return false;
        }

        $this->_mkdir(); // creates the directory or records the error

        $imgName = md5($UrlArr['path']) . '.' . $ext;
        if (file_put_contents($this->ImgDir . '/' . $imgName, $file) === false) {
            $this->error[] = error_get_last();
            return false;
        }

        $this->DownImg++;
        return $UrlArr['host'] . '/' . $imgName;
    }

    /**
     * Creates the image directory if it does not exist yet; a failure is
     * appended to $this->error.
     */
    private function _mkdir()
    {
        if (!is_dir($this->ImgDir)) {
            // The mode must be an octal literal; decimal 7777 yields unintended permission bits.
            if (!mkdir($this->ImgDir, 0777, true)) {
                $this->error[] = error_get_last();
            }
        }
    }

    /**
     * Builds the Word document from the collected content and hands it to
     * downFile().
     */
    private function CreateWordDom()
    {
        $PHPWord = new PHPWord();
        $PHPWord->setDefaultFontName('宋体');
        $PHPWord->setDefaultFontSize(11);
        $styleTable = array('borderSize' => 6, 'borderColor' => '006699', 'cellMargin' => 120);

        // New portrait section
        $section = $PHPWord->createSection();
        $section->addText($this->Details(), array(), array('spacing' => 120));

        foreach ($this->LinetextArr as $key => $lineArr) {
            if (isset($lineArr['tag'])) {
                if ($lineArr['tag'] == 'li') {
                    $section->addListItem($lineArr['text'], 0, '', '', array('spacing' => 120));
                } else if ($lineArr['tag'] == 'img') {
                    $section->addImage(
                        $lineArr['src'],
                        array('width' => $lineArr['width'], 'height' => $lineArr['height'], 'align' => 'center')
                    );
                } else if ($lineArr['tag'] == 'p') {
                    $section->addText($lineArr['text'], array(), array('spacing' => 120));
                }
            } else if ($key === 'table' && !empty($lineArr)) {
                // Strict comparison: with "==" the integer key 0 equals 'table' on PHP < 8.
                $PHPWord->addTableStyle('myOwnTableStyle', $styleTable);
                $table = $section->addTable('myOwnTableStyle');
                foreach ($lineArr as $tr) {
                    $table->addRow();
                    foreach ($tr as $td) {
                        $table->addCell(2000)->addText($td['text']);
                    }
                }
            }
        }

        $this->downFile($PHPWord);
    }

    /**
     * @return string Summary line with request count, downloaded images,
     *                elapsed time and memory consumption.
     */
    public function Details()
    {
        $msg = "一共请求:{$this->HttpRequestTime}次,共下载的图片有{$this->DownImg}张,并且下载完成大约使用时间:{$this->expendTime}秒,整个程序执行大约消耗内存是:{$this->expendmemory}KB,";
        return $msg;
    }

    /**
     * Saves the document to disk and streams it to the client as an
     * attachment. When no file name was set, "<host>.docx" is used.
     *
     * @param PHPWord $PHPWord Document to write.
     */
    public function downFile($PHPWord)
    {
        if (empty($this->filename)) {
            $UrlArr = parse_url($this->url);
            $base = isset($UrlArr['host']) ? $UrlArr['host'] : 'document';
            $this->filename = $base . '.docx';
        }

        // Save File
        $objWriter = PHPWord_IOFactory::createWriter($PHPWord, 'Word2007');
        $objWriter->save($this->filename);

        header('Pragma: public');
        header('Expires: 0');
        header('Cache-Control: must-revalidate, post-check=0, pre-check=0');
        header('Cache-Control: public');
        header('Content-Description: File Transfer');
        // Output type
        header('Content-type: application/msword');
        // Force the download
        $header = 'Content-Disposition: attachment; filename=' . $this->filename . ';';
        header($header);

        if (!is_readable($this->filename) || readfile($this->filename) === false) {
            $this->error[] = error_get_last();
        }
    }
}
上面的代码重点感觉不是word生成,而是Simplehtmldom的使用,这是一个开源的HTML解析器,之前有提到,这几天在看他的代码,
引出了两个学习方向
① 正则表达式
② 这个扩展的函数整理
看源代码的收获:
PHP的异常是可以捕获的,而且PHP的错误也是可以捕获的。
error_get_last() //用这个函数可以获取页面中最近一次发生的PHP错误,不谢。
更多关于PHP相关内容感兴趣的读者可查看本站专题:《php操作office文档技巧总结(包括word,excel,access,ppt)》、《PHP数组(Array)操作技巧大全》、《php排序算法总结》、《PHP常用遍历算法与技巧总结》、《PHP数据结构与算法教程》、《php程序设计算法总结》、《PHP数学运算技巧总结》、《php正则表达式用法总结》、《PHP运算与运算符用法总结》、《php字符串(string)用法总结》及《php常见数据库操作技巧汇总》
希望本文所述对大家PHP程序设计有所帮助。