用户登录  |  用户注册
首页 | 源码下载 | 网络学院 | 最新源码 | 源码排行 | 屏蔽广告
当前位置:新兴网络 > 网络学院 > Java编程 > Java Web

Java采集网络数据并导入数据库实例

减小字体 增大字体 作者:佚名  来源:本站整理  发布时间:2010-06-10 22:41:03

下面所用到的正则表达式可以参考:http://www.newxing.com/regex/

采集百度的歌曲名,歌手和链接实例:

Java code复制代码
package webTools;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import dbTools.DBTools;

/**
 * Scrapes a song chart page: downloads the HTML, extracts song name, singer
 * and link from each table cell with regular expressions, and stores every
 * extracted row through {@link DBTools}.
 */
public class IOTOWeb {
    /** Charset used to decode the downloaded page. */
    private static final String PAGE_CHARSET = "gb2312";

    /** Matches one table cell that contains at least one anchor. */
    private static final Pattern LINK_CELL_PATTERN = Pattern.compile(
            "<td[^>]*>[\\(]*<a[^>]*href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))[^>]*>(.*?)[\\)]*[\\s]*</td>",
            Pattern.DOTALL);

    /**
     * Matches an href attribute; the value lands in group 1 (double quoted),
     * group 2 (single quoted) or group 3 (unquoted).
     */
    private static final Pattern HREF_PATTERN = Pattern.compile(
            "href=(?:\"([^\"]*)\"|'([^']*)'|([^\\s>]*))", Pattern.DOTALL);

    /** Matches an anchor wrapped in parentheses. */
    private static final Pattern PERSON_PATTERN = Pattern.compile(
            "\\(<a[^>]*href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))[^>]*>(.*?)\\)",
            Pattern.DOTALL);

    /** Matches an anchor element that is followed by one whitespace character. */
    private static final Pattern SONG_PATTERN = Pattern.compile(
            "<a[^>]*href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))[^>]*>(.*?)</a>\\s",
            Pattern.DOTALL);

    /** Matches the table with class "list". */
    private static final Pattern MAIN_TABLE_PATTERN = Pattern.compile(
            "<table width=\"100%\" align=\"center\" cellpadding=\"0\" cellspacing=\"0\" class=\"list\">(.*?)</table>",
            Pattern.DOTALL);

    /** Matches any markup tag that does not span a line break. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    private static final String INSERT_SONG_SQL =
            "insert into song(songName,songPerson,songHref) values(?,?,?)";

    DBTools dbTools = new DBTools();

    /**
     * Downloads a page and returns its text with all lines concatenated (line
     * terminators are dropped).
     *
     * I/O failures are printed to standard error and whatever was read so far
     * is returned, so the result may be empty or partial.
     *
     * @param htmlURL address of the page to download
     * @return the page text decoded as gb2312, never {@code null}
     */
    public String getHtmlContent(String htmlURL) {
        StringBuilder htmlContent = new StringBuilder();
        java.io.InputStream stream = null;
        BufferedReader in = null;
        try {
            stream = new URL(htmlURL).openStream();
            in = new BufferedReader(new InputStreamReader(stream, PAGE_CHARSET));
            String rowContent;
            while ((rowContent = in.readLine()) != null) {
                htmlContent.append(rowContent);
            }
        } catch (IOException e) {
            // Covers MalformedURLException and UnsupportedEncodingException too.
            e.printStackTrace();
        } finally {
            // Always release the connection, even when reading failed half way.
            try {
                if (in != null) {
                    in.close();
                } else if (stream != null) {
                    stream.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return htmlContent.toString();
    }

    /**
     * Returns every table cell of the given HTML that contains an anchor.
     *
     * @param htmlContent HTML to search
     * @return the full text of each matching cell, in document order
     */
    public List<String> getLink(String htmlContent) {
        return findAll(LINK_CELL_PATTERN, htmlContent);
    }

    /**
     * Extracts the value of every href attribute in the given HTML.
     * Double-quoted, single-quoted and unquoted values are supported and the
     * surrounding quotes are not part of the result.
     *
     * @param htmlContent HTML to search
     * @return the href values, in document order
     */
    public List<String> getHref(String htmlContent) {
        List<String> hrefs = new ArrayList<String>();
        Matcher matcher = HREF_PATTERN.matcher(htmlContent);
        while (matcher.find()) {
            String value = matcher.group(1);
            if (value == null) {
                value = matcher.group(2);
            }
            if (value == null) {
                value = matcher.group(3);
            }
            hrefs.add(value);
        }
        return hrefs;
    }

    /**
     * Returns every parenthesised anchor of the given HTML, with the first
     * {@code href="} prefix and all double quotes removed from each match.
     *
     * @param htmlContent HTML to search
     * @return the processed matches, in document order
     */
    public List<String> getPerson(String htmlContent) {
        List<String> persons = new ArrayList<String>();
        Matcher matcher = PERSON_PATTERN.matcher(htmlContent);
        while (matcher.find()) {
            persons.add(matcher.group().replaceFirst("href=\"", "").replace("\"", ""));
        }
        return persons;
    }

    /**
     * Returns every anchor element of the given HTML that is directly
     * followed by a whitespace character.
     *
     * @param htmlContent HTML to search
     * @return the full text of each match (including the trailing
     *         whitespace character), in document order
     */
    public List<String> getSongName(String htmlContent) {
        return findAll(SONG_PATTERN, htmlContent);
    }

    /**
     * Returns the concatenation of all tables with class "list" found in the
     * page.
     *
     * @param htmlContent full page HTML
     * @return the matching tables joined together, or an empty string
     */
    public String getMainContent(String htmlContent) {
        StringBuilder mainContent = new StringBuilder();
        for (String table : findAll(MAIN_TABLE_PATTERN, htmlContent)) {
            mainContent.append(table);
        }
        return mainContent.toString();
    }

    /**
     * Removes all markup tags from the given text.
     *
     * @param s text that may contain tags
     * @return the text without tags
     */
    public String outTag(final String s) {
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /**
     * Downloads the chart page and inserts one row into the {@code song}
     * table for every cell from which a song anchor and a link can be
     * extracted. Cells without either are skipped.
     *
     * @param htmlURL address of the chart page
     * @throws Throwable if the database update fails
     */
    public void getFromBaiduMap3(String htmlURL) throws Throwable {
        String mainContent = getMainContent(getHtmlContent(htmlURL));
        for (String tdTag : getLink(mainContent)) {
            List<String> songNameList = getSongName(tdTag);
            if (songNameList.isEmpty()) {
                // No song anchor in this cell; skip it instead of aborting.
                continue;
            }
            String songAnchor = songNameList.get(0);
            List<String> hrefList = getHref(songAnchor);
            if (hrefList.isEmpty()) {
                continue;
            }

            List<String> personList = getPerson(tdTag);
            // When several singers match, the last one is stored.
            String songPerson = personList.isEmpty()
                    ? "无"
                    : outTag(personList.get(personList.size() - 1));

            ArrayList<String> list_values = new ArrayList<String>();
            list_values.add(outTag(songAnchor));
            list_values.add(songPerson);
            list_values.add(hrefList.get(0));
            dbTools.update(INSERT_SONG_SQL, list_values);
        }
    }

    /** Collects the full text of every match of {@code pattern} in {@code text}. */
    private static List<String> findAll(Pattern pattern, String text) {
        List<String> matches = new ArrayList<String>();
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            matches.add(matcher.group());
        }
        return matches;
    }
}

DBTools数据库连接类:

Java code复制代码
package dbTools;

import java.util.ArrayList;

import java.sql.*;

/**
 * Small JDBC helper that opens one MySQL connection when constructed and runs
 * parameterised statements on it.
 */
public class DBTools {
    private Connection connection;

    /**
     * Loads the MySQL driver and opens the connection. Failures are printed
     * to standard error and leave this helper without a connection; later
     * calls to {@link #query} or {@link #update} then fail with an
     * {@link IllegalStateException}.
     */
    public DBTools() {
        try {
            Class.forName("com.mysql.jdbc.Driver");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
        try {
            // NOTE(review): URL and credentials are hard-coded; consider
            // moving them to configuration.
            connection = DriverManager.getConnection(
                    "jdbc:mysql://localhost:3306/TestURL", "root", "zhuyi");
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs a parameterised SELECT and returns all rows.
     *
     * @param sql         statement text with {@code ?} placeholders
     * @param list_values values bound to the placeholders in order;
     *                    {@code null} means no parameters
     * @return one {@code String[]} per row, holding each column read with
     *         {@code getString}
     * @throws IllegalStateException if no connection could be opened
     * @throws Throwable             if preparing or executing the statement fails
     */
    public ArrayList<String[]> query(String sql, ArrayList<?> list_values) throws Throwable {
        ArrayList<String[]> listRows = new ArrayList<String[]>();
        PreparedStatement statement = requireConnection().prepareStatement(sql);
        try {
            bindParameters(statement, list_values);
            ResultSet rows = statement.executeQuery();
            try {
                // The column count is the same for every row; read it once.
                int columnCount = rows.getMetaData().getColumnCount();
                while (rows.next()) {
                    String[] rowinfo = new String[columnCount];
                    for (int i = 0; i < columnCount; i++) {
                        rowinfo[i] = rows.getString(i + 1);
                    }
                    listRows.add(rowinfo);
                }
            } finally {
                rows.close();
            }
        } finally {
            statement.close();
        }
        return listRows;
    }

    /**
     * Runs a parameterised INSERT/UPDATE/DELETE.
     *
     * @param sql         statement text with {@code ?} placeholders
     * @param list_values values bound to the placeholders in order;
     *                    {@code null} means no parameters
     * @throws IllegalStateException if no connection could be opened
     * @throws Throwable             if preparing or executing the statement fails
     */
    public void update(String sql, ArrayList<?> list_values) throws Throwable {
        PreparedStatement statement = requireConnection().prepareStatement(sql);
        try {
            bindParameters(statement, list_values);
            statement.executeUpdate();
        } finally {
            // Close even when binding or execution throws.
            statement.close();
        }
    }

    /** Returns the open connection or fails with a descriptive error. */
    private Connection requireConnection() {
        if (connection == null) {
            throw new IllegalStateException(
                    "No database connection: connecting failed in the DBTools constructor");
        }
        return connection;
    }

    /** Binds the values to the statement's placeholders, starting at index 1. */
    private static void bindParameters(PreparedStatement statement, ArrayList<?> values)
            throws SQLException {
        if (values == null) {
            return;
        }
        for (int i = 0; i < values.size(); i++) {
            statement.setObject(i + 1, values.get(i));
        }
    }
}

Servlet调用:

Java code复制代码
package controller;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.List;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import webTools.IOTOWeb;

public class TestURL extends HttpServlet {

    /**
     * Constructor of the object.
     */
    public TestURL() {
        super();
    }

    /**
     * Destruction of the servlet. <br>
     */
    public void destroy() {
        super.destroy(); // Just puts "destroy" string in log
        // Put your code here
    }

    /**
     * The doGet method of the servlet. <br>
     *
     * This method is called when a form has its tag value method equals to get.
     *
     * @param request
     *            the request send by the client to the server
     * @param response
     *            the response send by the server to the client
     * @throws ServletException
     *             if an error occurred
     * @throws IOException
     *             if an error occurred
     */
    public void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        try {
            IOTOWeb iotoWeb = new IOTOWeb();
            iotoWeb.getFromBaiduMap3("http://list.mp3.baidu.com/topso/mp3topsong.html?id=1?top2");
        } catch (Throwable e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }

    /**
     * The doPost method of the servlet. <br>
     *
     * This method is called when a form has its tag value method equals to
     * post.
     *
     * @param request
     *            the request send by the client to the server
     * @param response
     *            the response send by the server to the client
     * @throws ServletException
     *             if an error occurred
     * @throws IOException
     *             if an error occurred
     */
    public void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {

        response.setContentType("text/html");
        PrintWriter out = response.getWriter();
        out
                .println("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">");
        out.println("<HTML>");
        out.println("  <HEAD><TITLE>A Servlet</TITLE></HEAD>");
        out.println("  <BODY>");
        out.print("    This is ");
        out.print(this.getClass());
        out.println(", using the POST method");
        out.println("  </BODY>");
        out.println("</HTML>");
        out.flush();
        out.close();
    }

    /**
     * Initialization of the servlet. <br>
     *
     * @throws ServletException
     *             if an error occurs
     */
    public void init() throws ServletException {
        // Put your code here
    }

}

获取金书网的图书名:

Java code复制代码
package webTools;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import dbTools.DBTools;

/**
 * Downloads a book detail page, extracts the book name with a regular
 * expression and stores it through {@link DBTools}.
 */
public class GetBook {
    /** Matches the span (class "style15") that holds the book name. */
    private static final Pattern BOOK_NAME_PATTERN = Pattern.compile(
            "<span class=\"style15\">[^>]*</span>", Pattern.DOTALL);

    /** Matches any markup tag that does not span a line break. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    DBTools dbtools = new DBTools();

    /**
     * Downloads a page and returns its text with all lines concatenated (line
     * terminators are dropped).
     *
     * @param htmlURL address of the page to download
     * @return the page text decoded as gb2312
     * @throws Throwable if the URL is malformed or the download fails
     */
    public String getHtmlContent(String htmlURL) throws Throwable {
        StringBuilder htmlContent = new StringBuilder();
        BufferedReader in = new BufferedReader(new InputStreamReader(
                new URL(htmlURL).openStream(), "gb2312"));
        try {
            String rowContent;
            while ((rowContent = in.readLine()) != null) {
                htmlContent.append(rowContent);
            }
        } finally {
            // Release the connection even when reading fails half way.
            in.close();
        }
        return htmlContent.toString();
    }

    /**
     * Returns the first book name span found in the page, tags included.
     *
     * @param htmlContent full page HTML
     * @return the matched span, or an empty string when there is none
     */
    public String getBookName(String htmlContent) {
        Matcher matcher = BOOK_NAME_PATTERN.matcher(htmlContent);
        return matcher.find() ? matcher.group() : "";
    }

    /**
     * Removes all markup tags from the given text.
     *
     * @param s text that may contain tags
     * @return the text without tags
     */
    public String outTag(final String s) {
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /**
     * Downloads one book page and, when a non-empty book name is found,
     * prints it and inserts it into the {@code bookinfo} table.
     *
     * @param htmlURL address of the book page
     * @throws Throwable if the download or the database update fails
     */
    public void getFromJINSHU(String htmlURL) throws Throwable {
        String bookName = outTag(getBookName(getHtmlContent(htmlURL)));
        if (bookName.length() == 0) {
            return;
        }
        System.out.println(bookName);
        ArrayList<String> list_values = new ArrayList<String>();
        list_values.add(bookName);
        dbtools.update("insert into bookinfo(bookName) values(?)", list_values);
    }
}

调用Servlet:

Java code复制代码
package controller;

import java.io.IOException;
import java.io.PrintWriter;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import webTools.GetBook;

public class TestBook extends HttpServlet {

    /**
     * Constructor of the object.
     */
    public TestBook() {
        super();
    }

    /**
     * Destruction of the servlet. <br>
     */
    public void destroy() {
        super.destroy(); // Just puts "destroy" string in log
        // Put your code here
    }

    /**
     * The doGet method of the servlet. <br>
     *
     * This method is called when a form has its tag value method equals to get.
     *
     * @param request
     *            the request send by the client to the server
     * @param response
     *            the response send by the server to the client
     * @throws ServletException
     *             if an error occurred
     * @throws IOException
     *             if an error occurred
     */
    int i = 1;

    public void doGet(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        GetBook bookinfo = new GetBook();

        for (; i < 10000; i++) {
            String bookURL = "http://www.golden-book.com/booksinfo/12/" + i
                    + ".html";
            try {
                bookinfo.getFromJINSHU(bookURL);
            } catch (Throwable e) {
                i++;
                doPost(request, response);
            }
        }
    }

    /**
     * The doPost method of the servlet. <br>
     *
     * This method is called when a form has its tag value method equals to
     * post.
     *
     * @param request
     *            the request send by the client to the server
     * @param response
     *            the response send by the server to the client
     * @throws ServletException
     *             if an error occurred
     * @throws IOException
     *             if an error occurred
     */
    public void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        GetBook bookinfo = new GetBook();

        for (; i < 10000; i++) {
            String bookURL = "http://www.golden-book.com/booksinfo/12/" + i
                    + ".html";
            try {
                bookinfo.getFromJINSHU(bookURL);
            } catch (Throwable e) {
                i++;
                doGet(request, response);
            }
        }
    }

    /**
     * Initialization of the servlet. <br>
     *
     * @throws ServletException
     *             if an error occurs
     */
    public void init() throws ServletException {
        // Put your code here
    }
}

Tags:Java采集 导入数据库

作者:佚名
  • 好的评价 如果您觉得此文章好,就请您
      100%(3)
  • 差的评价 如果您觉得此文章差,就请您
      0%(0)

网络学院评论:评论内容只代表网友观点,与本站立场无关!

   评论摘要(共 0 条,得分 0 分,平均 0 分) 查看完整评论