如何使用JDBC快速处理大数据-其它-@大数据资讯

在实习工作中，要处理一张存有204万记录的表，由于记录是从互联网上取到的，所以里面有一些不太理想的词，比如一些词里混有特殊字符，标点符号，或者有些词根本就是标点符号等等。我写这个程序的目的就是把这些不理想的词找出来，能修改的就修改，没有必要修改的就直接删除。

[java]view plaincopy 
for(int i=0;i<205;i++)  
            {  
                String sql="select * from cat_keyword order by id limit "+i*10001+",10000";           
                String best="select * from cat_keyword where id>=(select id from cat_keyword order by id limit "+i*10001+",1)limit 10000";  
                rs=stmt.executeQuery(best);  
                go.filt(rs);  
            }  

经试验，第一条sql语句的执行效率是明显不如第二条的。

String best="select * from cat_keyword where id>=(select id from cat_keyword order by id limit "+i*10001+",1)limit 10000";

这条SQL语句尤其适用于offset特别大的情景，因为200多万条记录，到中后期，这个offset是很大的，使用未经优化的SQL语句，到后期是越来越慢。

另外limit的取值也很有考究，我试过1000,10000,70000,100000的，最后发现limit取10000的时候速度是最快的，这个跟数据的大小，电脑以及数据库内存的分配都有一定的关系，大家可以根据具体情况具体讨论。我这里仅仅提一下需要注意，可以优化的point。

我真正想说的是我想的另外一种解决方案，运用JDBC

下面给大家说下我这个程序的构造，首先是一个实现将需要删除和修改的词插入另一张表的方法

[java]view plaincopy 
//将找到的违规词插入cat_garbage表里  
   public void insert(ResultSet rs) throws SQLException  
   {  
       Connection conn=null;  
        Statement stmt=null;  
       try{           
            conn=Good.getConnection();  
                stmt = conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE  
                    , ResultSet.CONCUR_UPDATABLE);  
                String query="select * from cat_garbage where id='"+rs.getInt("id")+"'";  
                ResultSet rst=stmt.executeQuery(query);  
                if(!rst.next())  
                {  
                    String sql1="insert into cat_garbage(id,cid,name,keyword) values('"+rs.getInt("id")+"','"+rs.getInt("cid")+"','"+rs.getString("name")+"','"+rs.getString("keyword")+"')";  
                    stmt.executeUpdate(sql1);  
                }  
        }  
        catch(SQLException|ClassNotFoundException|IOException e)  
        {  
            e.printStackTrace();  
        }  
        finally  
        {  
            if(stmt!= null)   
                stmt.close();         
            if(conn!= null)   
                conn.close();   
        }  
   }  

然后是一个将可以修复的词修复的方法

[java]view plaincopy 
public void modify(ResultSet rs,String str,String reg) throws SQLException  
   {  
        
       Connection conn=null;  
       Statement stmt=null;  
       Statement stmt1=null;  
       try{                   
            conn=Good.getConnection();  
            stmt = conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE  
                    , ResultSet.CONCUR_UPDATABLE);  
            stmt1= conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE  
                    , ResultSet.CONCUR_UPDATABLE);  
            String ok=null;  
            if(str.indexOf(reg)==str.length()-1)  
            {  
                 ok=str.substring(0,str.indexOf(reg));  
                 System.out.println(ok);  
                 System.out.println(ok);  
                 System.out.println(1);  
            }  
            else if(str.indexOf(reg)==0)  
            {  
                 ok=str.substring(1);  
                 while(ok.indexOf(reg)!=-1)  
                      ok=ok.substring(1);  
                 System.out.println(2);  
            }  
            else  
            {  
                System.out.println(3);  
                System.out.println(str);  
                ok=str.substring(0,str.indexOf(reg));  
                  
            }  
            System.out.println(ok);  
            new Good().insert(rs);  
            //String query="select * from cat_keyword where cid='"+rs.getInt("cid")+"'and keyword='"+ok+"'";  
            String sql="update cat_garbage1 set new='"+ok+"' where id='"+rs.getInt(1)+"'";  
            //String sql1="update cat_keyword set keyword='"+ok+"' where id='"+rs.getInt("id")+"'";  
            stmt.executeUpdate(sql);  
       }  
          
        catch(SQLException|ClassNotFoundException|IOException e)  
        {  
            e.printStackTrace();  
        }  
        finally  
        {  
            if(stmt!= null)   
                stmt.close();     
            if(stmt1!= null)   
                stmt1.close();   
            if(conn!= null)   
                conn.close();   
        }  
   }  

最后就是一个核心的取词过滤方法，里面有很多正则表达式实现的取词规则。

[java]view plaincopy 
public void filt(ResultSet rs)  
    {         
     try  
      {  
        String str;       
        double a=0;double b=0;double c=0;double d=0;  
        int num=0;  
        Good good=new Good();  
        while(rs.next())  
       {      
        str=rs.getString("keyword");  
        if(str.equals(",")|str.equals(".")|str.equals(":")|str.equals("-")|str.equals("=")|str.equals("+")|str.equals("*")|str.equals("/")|str.equals("?")|str.equals("|")|str.equals("\\")|str.equals("%")|str.equals("#")|str.equals("!")||str.equals("~")||str.equals("、"))  
        {  
            good.insert(rs);  
            a++;  
        }  
        //字符串后面带逗号  
        if(rule(str,"^\\S+[,]$").matches())  
        {  
            good.modify(rs,str,",");  
            num++;  
        }  
        //字符串后面带句号  
        if(rule(str,"^\\S+[.]$").matches())  
        {  
            good.modify(rs,str,".");  
            num++;  
        }  
        //字符串前面带句号  
        if(rule(str,"^[.]\\S+$").matches())  
        {  
            System.out.println(123);  
            if(str.indexOf(".net")==-1)  
            {  
                if(str.indexOf(".com")==-1)  
                {  
                    if(str.indexOf(".cn")==-1)  
                    {  
                        if(str.indexOf(".org")==-1)  
                        {  
                            good.modify(rs, str, ".");  
                            num++;  
                        }  
                    }  
                }  
            }  
        }  
        //字符串后面带-  
        if(rule(str,"^\\S+[-]$").matches())  
        {  
            good.modify(rs, str, "-");  
            num++;  
        }  
        //字符串前面带-  
        if(rule(str,"^[-]\\S+$").matches())  
        {  
            good.modify(rs, str, "-");  
            num++;  
        }  
        //字符串前面带冒号  
        if(rule(str,"^[:]\\S+$").matches())  
        {  
            good.modify(rs, str, ":");  
            num++;  
        }  
        //字符串后面带冒号  
        if(rule(str,"^\\S+[:]$").matches())  
        {  
            good.modify(rs, str, ":");  
            num++;  
        }  
        //字符串后面带/  
        if(rule(str,"^\\S+[/]$").matches())  
        {  
            good.modify(rs, str, "/");  
            num++;  
        }  
        //字符串前面带/  
        if(rule(str,"^[/]\\S+$").matches())  
        {  
            good.modify(rs, str, "/");  
            num++;  
        }  
        //字符串后面带？  
        if(rule(str,"^\\S+[?]$").matches())  
        {  
            good.modify(rs, str, "?");  
            num++;  
        }  
        //字符串前面带？  
        if(rule(str,"^[?]\\S+$").matches())  
        {  
            good.modify(rs, str, "?");  
            num++;  
        }  
        //字符串前面带|  
        if(rule(str,"^[|]\\S+$").matches())  
        {  
            good.modify(rs, str, "|");  
            num++;  
        }  
        //字符串后面带|  
        if(rule(str,"^\\S+[|]$").matches())  
        {  
            good.modify(rs, str, "|");  
            num++;  
        }  
        //字符串前面带(  
        if(rule(str,"^[(][\\S\\s]+$").matches())  
        {  
            if(rule(str,"^[(]\\S+[)]\\S*$").matches())  
            {  
                str=str.replaceAll("\\)", " ");  
                good.modify(rs, str, "(");  
                num++;  
            }  
            else  
            {  
                good.modify(rs, str, "(");  
                num++;  
            }  
        }  
        //字符串前面带[  
        if(rule(str,"^[\\[][\\S\\s]+$").matches())  
        {  
            if(rule(str,"^[\\[]\\S+[\\]]\\S*$").matches())  
            {  
                str=str.replaceAll("\\]", " ");  
                good.modify(rs, str, "[");  
                num++;  
            }  
            else  
            {  
                good.modify(rs, str, "[");  
                num++;  
            }  
        }  
        //字符串后面带)或者里面带()  
        if(rule(str,"^\\S+\\s*\\S+[)]$").matches())  
        {  
            System.out.println("qwe");  
            if(rule(str,"^\\S+[(]\\S+[)]$").matches())  
            {  
                str=str.replaceAll("\\(", " ");  
                good.modify(rs, str, ")");  
                num++;  
            }  
            else if(rule(str,"^\\S+\\s*[(]\\S+[)]$").matches())  
            {  
                str=str.replaceAll("\\(", "");  
                good.modify(rs, str, ")");  
                num++;  
            }  
            else  
                {  
                    good.modify(rs, str, ")");  
                    num++;  
                }  
        }  
        //字符串后面带]或者里面带[]  
                if(rule(str,"^\\S+\\s*\\S+[\\]]$").matches())  
                {  
                    System.out.println("qwe");  
                    if(rule(str,"^\\S+[\\[]\\S+[\\]]$").matches())  
                    {  
                        str=str.replaceAll("\\[", " ");  
                        good.modify(rs, str, "]");  
                        num++;  
                    }  
                    else if(rule(str,"^\\S+\\s*[\\[]\\S+[\\]]$").matches())  
                    {  
                        str=str.replaceAll("\\[", "");  
                        good.modify(rs, str, "]");  
                        num++;  
                    }  
                    else  
                        {  
                            good.modify(rs, str, "]");  
                            num++;  
                        }  
                }  
        //字符串内包含书名号  
        if(rule(str,"^[\\s\\S]*[《][\\s\\S]+[》]").matches())  
        {  
            str=str.replaceAll("》", " ");  
            good.modify(rs, str, "《");  
            num++;  
        }  
        //字符串内包含中文（）  
        if(rule(str,"^[\\s\\S]*[（][\\s\\S]+[）]").matches())  
        {  
            str=str.replaceAll("（", " ");  
            good.modify(rs, str, "）");  
            num++;  
        }  
        //纯数字  
        if(rule(str,"^[1-9]*|0$").matches())  
        {  
            good.insert(rs);  
            b++;  
        }  
        //单字母  
        if(rule(str,"^[a-zA-Z]$").matches())  
        {  
            int flag=0;  
            String shit=rs.getString(3);  
            for(int i=0;i<shit.length();i++)  
                {  
                    if(str.charAt(0)!=shit.charAt(i))                 
                    flag++;                           
                    if(flag==shit.length())  
                    {  
                        good.insert(rs);  
                        c++;                          
                    }                 
                }  
        }  
        //单汉字  
        if(rule(str,"^[\u4e00-\u9fa5]$").matches())  
        {  
            int flag=0;  
            String shit=rs.getString(3);  
            for(int i=0;i<shit.length();i++)  
            {                 
                if(str.charAt(0)!=shit.charAt(i))                                         
                flag++;                                                   
                if(flag==shit.length())  
                {  
                    good.insert(rs);  
                    d++;                          
                }                                     
            }                     
        }  
      }  
        System.out.println("单标点百分比："+100*a/2046036+"%");  
        System.out.println("纯数字百分比："+100*b/2046036+"%");  
        System.out.println("单字母百分比："+100*c/2046036+"%");  
        System.out.println("单汉字百分比："+100*d/2046036+"%");  
        System.out.println(num);  
        }  
        catch(SQLException e)  
        {  
            e.printStackTrace();  
        }     
          
    }  

下面是程序的主方法

[java]view plaincopy 
public static void main(String[] args) throws SQLException   
    {  
        Connection conn=null;  
        Statement stmt=null;  
        try{  
            long a=System.currentTimeMillis();  
            ResultSet rs;  
            conn=Good.getConnection();  
            stmt = conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE  
                    , ResultSet.CONCUR_UPDATABLE);  
            Good go=new Good();  
            /*for(int i=0;i<205;i++) 
            { 
                //String sql="select * from cat_keyword order by id limit "+i*10001+",10000";            
                String best="select * from cat_keyword where id>=(select id from cat_keyword order by id limit "+i*10001+",1)limit 10000"; 
                rs=stmt.executeQuery(best); 
                go.filt(rs); 
            }*/  
            String sql="select * from cat_keyword";  
            rs=stmt.executeQuery(sql);  
            go.filt(rs);  
            long b=System.currentTimeMillis();  
            System.out.println("耗时： "+(b-a)/60000+"分钟");  
            }  
        catch(ClassNotFoundException|SQLException|IOException e)  
        {  
            e.printStackTrace();  
        }  
        finally  
        {  
            if(stmt!= null)   
                stmt.close();         
            if(conn!= null)   
                conn.close();   
        }  
    }  

用一个SQL语句，将整个表的数据一次性作为参数传给负责取词过滤的filt()方法，然后利用ResultSet rs这个指针依次向下对每个词进行处理，这个方案运行下来处理完全表204万记录只需11分钟，而是用优化之后的limit语句获得的结果集再传给filt()方法那个方案，却需要26分钟。可见使用JDBC，直接传入所有数据的结果集，比使用limit多次限量导入数据要快的多，而且不止是一倍的关系。前者11分钟完胜后者的26分钟。