使用 HtmlAgilityPack 抓取网页上的数据

2021-04-10 15:26

阅读:593

标签:ESS   数据   oid   private   file   pdo   lag   ace   eai   

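    /// <summary>
    /// Scrapes administrative-division code tables published by the Ministry of Civil Affairs (民政部, MCA).
    /// http://www.mca.gov.cn/article/sj/xzqh/1980/
    /// https://github.com/zzzprojects/html-agility-pack
    /// https://github.com/linezero/HtmlAgilityPack
    /// </summary>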
    public partial class Form1 : Form
    {

        // Defaults below equal the values of the 2019 row built in McaData().
        // NOTE(review): presumably these are overwritten from the selected row inside
        // button1_Click; that code is not fully visible here — confirm.

        // Index of the table column holding the administrative-division code
        // (same name and meaning as the "codecell" column in McaData()).
        int codecell = 2;
        // Index of the table column holding the administrative-division name
        // (same name and meaning as the "namecell" column in McaData()).
        int namecell = 3;
        // Selected year; button1_Click assigns it from comboBox1.Text.
        // The identifier looks like a typo of "yearname"; left as-is because other code references it.
        int yearnmae = 2019;
        // Table selector (same name as the "tableNo" column in McaData()).
        string tableNo = "table";
        // Row selector (same name as the "trNo" column in McaData()).
        string trNo = "tr";
        // Cell selector covering both header and data cells (same name as the "tdthNo" column in McaData()).
        string tdthNo = "th|td";
        /// <summary>
        /// Builds the per-year lookup table (1980-2019) of mca.gov.cn pages together with
        /// the hints needed to locate the code/name columns inside each page's table.
        /// </summary>
        /// <returns>
        /// A new <see cref="DataTable"/> with the columns year, website, codecell, namecell,
        /// tableNo, trNo and tdthNo, holding one row per year in descending year order.
        /// </returns>
        DataTable McaData()
        {
            DataTable pages = new DataTable();

            // Column layout:
            //   year     - year of the data set
            //   website  - URL of the page for that year
            //   codecell - which table column holds the administrative-division code
            //   namecell - which table column holds the administrative-division name
            //   tableNo  - table selector
            //   trNo     - row selector
            //   tdthNo   - cell selector
            string[] columnNames = { "year", "website", "codecell", "namecell", "tableNo", "trNo", "tdthNo" };
            var columnTypes = new[] { typeof(int), typeof(string), typeof(int), typeof(int), typeof(string), typeof(string), typeof(string) };
            for (int c = 0; c < columnNames.Length; c++)
            {
                pages.Columns.Add(columnNames[c], columnTypes[c]);
            }

            // Each entry: { year, website, codecell, namecell, tableNo }.
            // The row and cell selectors are the same for every year and are appended below.
            object[][] entries =
            {
                new object[] { 2019, "http://www.mca.gov.cn/article/sj/xzqh/1980/2019/202002281436.html", 2, 3, "table" },
                new object[] { 2018, "http://www.mca.gov.cn/article/sj/xzqh/1980/201903/201903011447.html", 2, 3, "table" },
                new object[] { 2017, "http://www.mca.gov.cn/article/sj/xzqh/1980/201803/201803131454.html", 2, 3, "table" },
                // Original author's note for 2016: the TBODY tags on this page are all upper-case.
                new object[] { 2016, "http://www.mca.gov.cn/article/sj/xzqh/1980/201705/201705311652.html", 2, 3, "table//tbody" },
                new object[] { 2015, "http://www.mca.gov.cn/article/sj/tjbz/a/2015/201706011127.html", 2, 3, "table" },
                new object[] { 2014, "http://files2.mca.gov.cn/cws/201502/20150225163817214.html", 2, 3, "table" },
                new object[] { 2013, "http://files2.mca.gov.cn/cws/201404/20140404125552372.htm", 2, 3, "table" },
                new object[] { 2012, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201707271556.html", 2, 3, "table" },
                new object[] { 2011, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201707271552.html", 2, 3, "table" },
                new object[] { 2010, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220946.html", 2, 3, "table" },
                new object[] { 2009, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220943.html", 2, 3, "table" },
                new object[] { 2008, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220941.html", 2, 3, "table" },
                new object[] { 2007, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220939.html", 2, 3, "table" },
                new object[] { 2006, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220936.html", 2, 3, "table" },
                new object[] { 2005, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220935.html", 2, 3, "table" },
                new object[] { 2004, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220930.html", 2, 3, "table" },
                new object[] { 2003, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220928.html", 2, 3, "table" },
                new object[] { 2002, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220927.html", 2, 3, "table" },
                new object[] { 2001, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220925.html", 2, 3, "table" },
                new object[] { 2000, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220923.html", 2, 3, "table" },
                new object[] { 1999, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220921.html", 2, 3, "table" },
                new object[] { 1998, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220918.html", 2, 3, "table" },
                new object[] { 1997, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220916.html", 2, 3, "table" },
                new object[] { 1996, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220914.html", 2, 3, "table" },
                new object[] { 1995, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220913.html", 2, 3, "table" },
                new object[] { 1994, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220911.html", 2, 3, "table" },
                new object[] { 1993, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708041023.html", 2, 3, "table" },
                new object[] { 1992, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220910.html", 2, 3, "table" },
                new object[] { 1991, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708041020.html", 2, 3, "table" },
                new object[] { 1990, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708041018.html", 2, 3, "table" },
                new object[] { 1989, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708041017.html", 2, 3, "table" },
                new object[] { 1988, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220903.html", 2, 3, "table" },
                new object[] { 1987, "http://www.mca.gov.cn/article/sj/xzqh/1980/1980/201911180950.html", 2, 3, "table" },
                new object[] { 1986, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220859.html", 2, 3, "table" },
                new object[] { 1985, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220858.html", 2, 3, "table" },
                // From 1984 back to 1980 the code/name column indexes are 1 and 2 instead of 2 and 3.
                new object[] { 1984, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708220856.html", 1, 2, "table" },
                new object[] { 1983, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708160821.html", 1, 2, "table" },
                new object[] { 1982, "http://www.mca.gov.cn/article/sj/xzqh/1980/1980/201911180942.html", 1, 2, "table" },
                new object[] { 1981, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708041004.html", 1, 2, "table" },
                new object[] { 1980, "http://www.mca.gov.cn/article/sj/tjbz/a/201713/201708040959.html", 1, 2, "table" },
            };

            foreach (object[] entry in entries)
            {
                pages.Rows.Add(entry[0], entry[1], entry[2], entry[3], entry[4], "tr", "th|td");
            }

            return pages;
        }

        /// <summary>
        /// Creates the form and runs <c>InitializeComponent()</c>.
        /// </summary>
        public Form1()
        {
            this.InitializeComponent();
        }
        /// <summary>
        /// Binds <c>comboBox1</c> to the table returned by <c>McaData()</c>, displaying the
        /// "year" column and using the "website" column as the selected value.
        /// Presumably wired to the form's Load event (the wiring is not visible here).
        /// </summary>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event data; not used.</param>
        private void Form1_Load(object sender, EventArgs e)
        {
            var yearPages = McaData();
            var picker = this.comboBox1;

            // Assignment order is kept as DataSource -> DisplayMember -> ValueMember,
            // since changing it can alter when binding-related events fire.
            picker.DataSource = yearPages;
            picker.DisplayMember = "year";
            picker.ValueMember = "website";
        }
        /// 
        /// 抓取数据
        /// Geovin Du 涂聚文
        /// 
        /// 
        /// 
        private void button1_Click(object sender, EventArgs e)
        {
            try
            {
                string website = this.comboBox1.SelectedValue.ToString();

                //codecell =(int)this.numericUpDown1.Value;
                //namecell = (int)this.numericUpDown2.Value;
                HtmlAgilityPack.HtmlWeb webClient = new HtmlAgilityPack.HtmlWeb();
                HtmlAgilityPack.HtmlDocument doc = webClient.Load(website);
                this.richTextBox1.Text = doc.Text.ToLower();
                //HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("/html[1]/body[1]/div[1]/div[2]/div[3]/div[2]/div[1]/div[1]/div[1]/div");
                //foreach (HtmlNode node in nodes)
                //{
                //    Console.WriteLine(node.InnerText.Trim());
                //}
                //nodes = null;

                yearnmae = int.Parse(this.comboBox1.Text);
                DataRow[] drsselect = McaData().Select("year="+yearnmae+"");
                for (int i = 0; i 

  

使用 HtmlAgilityPack 抓取网页上的数据

标签:ESS   数据   oid   private   file   pdo   lag   ace   eai   

原文地址:https://www.cnblogs.com/geovindu/p/12427358.html


评论


亲,登录后才可以留言!