采集資源的方法(文字,圖片) [圖片]


 

由於網絡的開放性,我們瀏覽網站時,服務器都會把數據發送到本地,這就造就了采集的環境。

之前研究采集已有一段時間,在劉建的幫助下,終於可以把別人網站的內容采集到自己的網站上顯示出來。但是這樣有一個很大的弊端:如果被采集的網站關閉了,你的網站也會因為采集不到內容而無法顯示。解決這個問題的最好辦法,是把采集到的數據存放到本地,這樣就算別人的網站掛了,也不會影響自己的網站。經過和劉建的討論,總結了如下采集流程,今天把它實現了。


圖片

根據流程圖,代碼如下:

using System;
using System.Collections.Generic;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Data.SqlClient;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;

/// <summary>
/// Code-behind for the chapter page. On each request it checks whether the
/// requested book and chapter are already stored in the local database:
/// if either is missing, the chapter is scraped from the remote site, its
/// images are downloaded to the local image folder and the result is stored;
/// otherwise the chapter is read straight from the database.
/// </summary>
public partial class chapter : System.Web.UI.Page
{
    // Values filled in by Page_Load; presumably consumed by the .aspx markup (not visible here).
    protected string title, content,newContent, bookurl, readurl, provpage, nextpage, zhangjie, keywords, description;

    // Root of the site the chapter HTML is scraped from.
    private const string RemoteSiteRoot = "http://www.lovepd.com/";

    // Host the chapter images are downloaded from.
    private const string RemoteImageRoot = "http://2.yxmimi.com/";

    // Site-relative folder the downloaded images are written to.
    private const string LocalImageFolder = "/pic/section/";

    // Matches an attachment image path as it appears in the scraped chapter content.
    private const string AttachmentPattern = @"/files/article/attachment/(\d+)_(\d+)_(\d+)_(\d+)\.(\w+)";

    /// <summary>
    /// Loads the chapter identified by the "bookid" and "chapterid" query-string
    /// values, scraping and caching it first when it is not in the database yet,
    /// then builds the page title and the keywords/description strings.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event arguments (unused).</param>
    /// <exception cref="HttpException">
    /// Status 400 when "bookid" or "chapterid" is missing or not purely numeric.
    /// </exception>
    protected void Page_Load(object sender, EventArgs e)
    {
        string bookid = Request.QueryString["bookid"];
        string sectionid = Request.QueryString["chapterid"];
        jiangs_Tools.check_str(bookid);
        jiangs_Tools.check_str(sectionid);

        // Both ids are concatenated unquoted into SQL below, so anything that is
        // not a plain number is rejected here instead of reaching the database.
        if (!IsDigits(bookid) || !IsDigits(sectionid))
        {
            throw new HttpException(400, "bookid and chapterid must be numeric.");
        }

        jiang_Db_Sql newdb = new jiang_Db_Sql();
        bool hasBook = QueryExists(newdb, "select count(*) from [book] where [bookid]=" + bookid);
        bool hasSection = QueryExists(newdb, "select count(*) from [section] where [sectionid]=" + sectionid);

        if (!hasBook || !hasSection)
        {
            // Book or chapter not cached yet: fetch it from the remote site.
            ScrapeChapter(bookid, sectionid);
            Response.Flush();//push buffered output to the client before the slower storage work

            newContent = content;//copy that savePic rewrites to point at the local image paths
            if (!hasBook)
            {
                InsertBook(newdb, bookid, title);
            }

            string imgContent = jiangs_Rex.GetRegValue(AttachmentPattern, content, "|", "");//image paths found in the content
            if (!string.IsNullOrEmpty(imgContent))
            {
                savePic(imgContent);
            }
            InsertSection(newdb, sectionid, bookid);
        }
        else
        {
            // Both book and chapter exist: read everything from the database.
            LoadChapterFromDatabase(newdb, bookid, sectionid);
        }

        this.Page.Title = title + ">> " + zhangjie + " - 天下小說網";
        keywords = "\"" + title + "最新章節列表," + title + "全文閱讀," + title + "TXT電子書下載," + title + "JAR電子書下載," + title + "UMD電子書下載\"";
        description = "\"天下小說網為小說愛好者提供" + title + "最近更新章節閱讀," + title + "全文在線閱讀," + title
           + "最新章節電子書下載(包括" + title + "的TXT格式下載、" + title + "的JAR格式下載、" + title + "的UMD格式下載)\"";
    }

    /// <summary>
    /// Downloads every image listed in <paramref name="imgcontent"/> into the local
    /// image folder and rewrites the image paths in <c>newContent</c> to the local
    /// copies. A failed download is traced and skipped; it never aborts the page.
    /// </summary>
    /// <param name="imgcontent">
    /// Image paths separated by '|', with one leading '|' (the form produced by the
    /// GetRegValue call in Page_Load). Null or empty means there is nothing to do.
    /// </param>
    public void savePic(string imgcontent)
    {
        if (string.IsNullOrEmpty(imgcontent))
        {
            return;
        }
        imgcontent = imgcontent.Remove(0, 1);//drop the leading '|'
        string[] imagePaths = imgcontent.Split('|');

        using (WebClient client = new WebClient())
        {
            foreach (string imagePath in imagePaths)
            {
                if (string.IsNullOrEmpty(imagePath))
                {
                    continue;//a stray separator leaves an empty entry; nothing to download
                }
                try
                {
                    // The remote host stores a_b_c_d.ext as a/b/c/d.ext.
                    string remotePath = Regex.Replace(imagePath, AttachmentPattern, "files/article/attachment/$1/$2/$3/$4.$5");
                    string imgName = Path.GetFileName(imagePath);//file name only, never a directory part
                    byte[] data = client.DownloadData(RemoteImageRoot + remotePath);
                    File.WriteAllBytes(Server.MapPath(LocalImageFolder + imgName), data);
                }
                catch (Exception ex)
                {
                    // Best effort: one missing image must not break the chapter,
                    // but the failure is recorded instead of silently dropped.
                    Trace.Warn("savePic", "Could not store image " + imagePath, ex);
                }
            }
        }

        if (newContent != null)
        {
            newContent = Regex.Replace(newContent, AttachmentPattern, LocalImageFolder + "$1_$2_$3_$4.$5");
        }
    }

    /// <summary>
    /// Fetches the chapter page from the remote site and extracts the book title,
    /// chapter title, chapter content and navigation URLs into the page fields.
    /// </summary>
    private void ScrapeChapter(string bookid, string sectionid)
    {
        string cutid = Request.QueryString["cutid"];
        // cutid is URL-encoded so it cannot smuggle extra parameters into the remote request.
        string html = jiangs_Rex.GetRemoteHtmlCode(RemoteSiteRoot + "chapter.php?cutid=" + HttpUtility.UrlEncode(cutid)
            + "&bookid=" + bookid + "&chapterid=" + sectionid);
        html = html.Replace("http://www.lovepd.com/", "");
        html = html.Replace("http://lovepd.com/", "");
        html = jiangs_Rex.ReplaceListUrl(html, @"(chapter)\.php\?cutid=(\d+)&bookid=(\d+)&chapterid=(\d+)", 4);
        html = jiangs_Rex.ReplaceListUrl(html, @"(read)\.php\?cutid=(\d+)\&bookid=(\d+)", 3);//table-of-contents links
        content = jiangs_Rex.GetRegValue(@"(?<=<p align=""left"">)(.*?)(?=</p>)", html, 1);//chapter content
        html = jiangs_Rex.ReplaceListUrl(html, @"readend\.php\?bookid=(\d+)", 1, "/read/" + cutid);//table-of-contents links
        content = jiangs_Rex.ReplaceContentImgUrl(content);//converts image URLs when the chapter is an image novel
        content = content.Replace("第九文學 www.d9123.com", "");
        title = jiangs_Rex.GetRegValue(@"(\w+)最新章節列表</b>", html, 1);//book title
        bookurl = "/book/" + bookid + ".html";//book introduction url
        readurl = jiangs_Rex.GetRegValue(@"main_page = ""([^""]+)""", html, 1);//table-of-contents url
        provpage = jiangs_Rex.GetRegValue(@"back_page = ""([^""]+)""", html, 1);//previous page url
        nextpage = jiangs_Rex.GetRegValue(@"next_page = ""([^""]+)""", html, 1);//next page url
        zhangjie = jiangs_Rex.GetRegValue(@"(?<=<h1>)([^>]+)(?=</h1>)", html, 1);//chapter title
    }

    /// <summary>Stores the book record for a book seen for the first time.</summary>
    private static void InsertBook(jiang_Db_Sql db, string bookid, string bookName)
    {
        string sql = string.Format("insert into [book]([bookId],[bookName]) values('{0}','{1}')",
            SqlQuote(bookid), SqlQuote(bookName));
        RunNonQuery(db, sql);
    }

    /// <summary>Stores the scraped chapter (with locally rewritten image paths) in the section table.</summary>
    private void InsertSection(jiang_Db_Sql db, string sectionid, string bookid)
    {
        string sql = string.Format("insert into [section]([sectionId],[bookId],[sectionTitle],[sectionContent],[readUrl],[provPage],[nextPage]) values('{0}','{1}','{2}','{3}','{4}','{5}','{6}')",
            SqlQuote(sectionid), SqlQuote(bookid), SqlQuote(zhangjie), SqlQuote(newContent),
            SqlQuote(readurl), SqlQuote(provpage), SqlQuote(nextpage));
        RunNonQuery(db, sql);
    }

    /// <summary>Reads an already stored chapter, together with its book name, into the page fields.</summary>
    private void LoadChapterFromDatabase(jiang_Db_Sql db, string bookid, string sectionid)
    {
        string sql = "select a.[bookName],b.[sectionTitle],b.[sectionContent],b.[readUrl],b.[provPage],b.[nextPage] from [book] as a,[section] as b where a.[bookId]=b.[bookId] and b.[sectionId]=" + sectionid;
        db.Open();
        try
        {
            using (SqlDataReader reader = db.Re_dr(sql))
            {
                if (reader.Read())
                {
                    title = reader[0].ToString();//book name
                    zhangjie = reader[1].ToString();//chapter title
                    content = reader[2].ToString();//chapter content
                    readurl = reader[3].ToString();//table-of-contents url
                    provpage = reader[4].ToString();//previous page url
                    nextpage = reader[5].ToString();//next page url
                    bookurl = "/book/" + bookid + ".html";//book introduction url
                }
            }
        }
        finally
        {
            db.Close();//released even when the query throws
        }
    }

    /// <summary>
    /// Runs a "select count(*)" style query through Exec_Sql and returns its result.
    /// NOTE(review): Exec_Sql presumably returns true when the count is non-zero;
    /// confirm against jiang_Db_Sql.
    /// </summary>
    private static bool QueryExists(jiang_Db_Sql db, string sql)
    {
        db.Open();
        try
        {
            return db.Exec_Sql(sql);
        }
        finally
        {
            db.Close();
        }
    }

    /// <summary>Executes a statement through ExecSql, always closing the connection afterwards.</summary>
    private static void RunNonQuery(jiang_Db_Sql db, string sql)
    {
        db.Open();
        try
        {
            db.ExecSql(sql);
        }
        finally
        {
            db.Close();
        }
    }

    /// <summary>
    /// Makes a value safe to embed between single quotes in a SQL literal by
    /// doubling every single quote. Null is treated as the empty string, which is
    /// what string.Format would have produced for it.
    /// </summary>
    private static string SqlQuote(string value)
    {
        return (value ?? string.Empty).Replace("'", "''");
    }

    /// <summary>Returns true when the value consists of one or more ASCII digits and nothing else.</summary>
    private static bool IsDigits(string value)
    {
        return value != null && Regex.IsMatch(value, @"\A[0-9]+\z");
    }

}


 


注意!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系我们删除。



 
粤ICP备14056181号  © 2014-2020 ITdaan.com