原创

MySQL-数据修复-删除重复线上考试数据-delete or update-以两个字段分组-保留小id行的数据-having等说明-按字段group b


    -- 同时有两个字段的判重 exam_id,login_id 
    select * from exams_member m 
    inner JOIN (SELECT exam_id,login_id FROM exams_member GROUP BY exam_id,login_id HAVING COUNT(1)>1) t
    on t. exam_id = m.exam_id and t.login_id = m.login_id
    where m.id not in (SELECT MIN(id) AS id FROM exams_member GROUP BY exam_id,login_id HAVING COUNT(1)>1)
    -- and m.exam_id = 3738 and m.login_id = 'hbwss';

    DELETE from exams_member where id in (SELECT id from (select id from exams_member m 
    inner JOIN (SELECT exam_id,login_id FROM exams_member GROUP BY exam_id,login_id HAVING COUNT(1)>1) t
    on t. exam_id = m.exam_id and t.login_id = m.login_id
    where m.id not in (SELECT MIN(id) AS id FROM exams_member GROUP BY exam_id,login_id HAVING COUNT(1)>1)
    -- and m.exam_id = 3738 and m.login_id = 'hbwss'
    ) a);


    select * from exam_marking m inner JOIN (SELECT work_id,user_id FROM exam_marking GROUP BY work_id,user_id HAVING COUNT(1)>1) t
    on t. work_id = m.work_id and t.user_id = m.user_id
    where m.id not in (SELECT MIN(id) AS id FROM exam_marking GROUP BY work_id,user_id HAVING COUNT(1)>1)
    and m.is_delete = 0
    -- and m.work_id = 3738 and m.user_id = 'hbwss';

    select count(1) from exam_marking m inner JOIN (SELECT work_id,user_id FROM exam_marking GROUP BY work_id,user_id HAVING COUNT(1)>1) t
    on t. work_id = m.work_id and t.user_id = m.user_id
    where m.id not in (SELECT MIN(id) AS id FROM exam_marking GROUP BY work_id,user_id HAVING COUNT(1)>1)
    and m.is_delete = 0
    -- and m.work_id = 3738 and m.user_id = 'hbwss';


    -- 执行update 包了2层
    update exam_marking set is_delete = 1 where id in (select id from (select m.id from exam_marking m inner JOIN (SELECT work_id,user_id FROM exam_marking GROUP BY work_id,user_id HAVING COUNT(1)>1) t
    on t. work_id = m.work_id and t.user_id = m.user_id
    where m.id not in (SELECT MIN(id) AS id FROM exam_marking GROUP BY work_id,user_id HAVING COUNT(1)>1)
    and m.is_delete = 0
    -- and m.work_id = 3738 and m.user_id = 'hbwss'
    ) as a);
   *********************如果要带上条件的话,FROM exam_marking GROUP BY work_id,user_id 中间是不是也可以加上条件*******************************



    以上带and m.work_id = 3738 and m.user_id = 'hbwss'就是处理指定考试、人的数据。不带条件就是处理全表数据。文本还有按指定容器处理数据的。

    -- 136条
    SELECT exam_id,login_id,count(1) as c from exams_member  GROUP BY  exam_id,login_id HAVING c > 1 ;
    -- 14360条
    SELECT work_id,user_id,count(1) FROM exam_marking where is_delete = 0 GROUP BY work_id,user_id HAVING COUNT(1) > 1;

    -- 手动处理
    SELECT * from  exam_marking WHERE work_id = 3738 and user_id = 'hbwss';
    SELECT * from exams_member WHERE exam_id = 3738 and login_id = 'hbwss';
    -- 删除大的id
    DELETE from exams_member where id = 289199;
    update exam_marking set is_delete = 1 where id =  475506

    -- 验证
    -- 136条
    SELECT exam_id,login_id,count(1) as c from exams_member  GROUP BY  exam_id,login_id HAVING c > 1 ;
    -- 14360条 ( create_time > 1646049600) 2022-02-28 20:00:00
    SELECT work_id,user_id,count(1) FROM exam_marking where is_delete = 0  and create_time > UNIX_TIMESTAMP('2022-02-28 20:00:00') GROUP BY work_id,user_id HAVING COUNT(1)>1 ;  也有=1 来判断一下有没有创建过考试
    SELECT work_id,user_id,count(1) FROM exam_marking where is_delete = 0  and enterprise_id = 'jiangsudipukejigufen' GROUP BY work_id,user_id HAVING COUNT(1)>1 ;

        -- 培训的考试 修复完并发问题后 
        SELECT work_id,user_id,count(1),FROM_UNIXTIME(w.create_time) create_time FROM exam_marking m left join exams_work w on w.id = m.work_id where m.is_delete = 0  and m.create_time > UNIX_TIMESTAMP('2022-02-28 20:00:00') and w.exam_source = 1 GROUP BY m.work_id,m.user_id HAVING COUNT(1)>1  order by create_time desc ; 
        为空
        -- 培训的考试 所有的数据
        mysql> SELECT m.id, work_id,user_id,count(1),FROM_UNIXTIME(w.create_time) create_time FROM exam_marking m left join exams_work w on w.id = m.work_id where m.is_delete = 0 and w.exam_source = 1 GROUP BY m.work_id,m.user_id HAVING COUNT(1)>1  order by create_time desc ;
        -- 还有11条22月2月17号数据
        +--------+---------+-----------------+----------+---------------------+
        | id     | work_id | user_id         | count(1) | create_time         |
        +--------+---------+-----------------+----------+---------------------+
        | 472924 |    3690 | lecturer02      |        2 | 2022-02-17 20:39:21 |
        | 472927 |    3690 | lecturer_two    |        2 | 2022-02-17 20:39:21 |
        | 472923 |    3690 | administrator02 |        2 | 2022-02-17 20:39:21 |
        | 472926 |    3690 | lecturer_one    |        2 | 2022-02-17 20:39:21 |
        | 472925 |    3690 | lecturer1       |        2 | 2022-02-17 20:39:21 |
        | 472762 |    3687 | admin_51        |        2 | 2022-02-17 19:35:11 |
        | 472752 |    3686 | administrator02 |        2 | 2022-02-17 19:18:05 |
        | 472755 |    3686 | lecturer_one    |        2 | 2022-02-17 19:18:05 |
        | 472754 |    3686 | lecturer1       |        2 | 2022-02-17 19:18:05 |
        | 472753 |    3686 | lecturer02      |        2 | 2022-02-17 19:18:05 |
        | 472756 |    3686 | lecturer_two    |        2 | 2022-02-17 19:18:05 |
        +--------+---------+-----------------+----------+---------------------+
        11 rows in set (0.03 sec)
        修复
        update exam_marking set is_delete = 1 where id in (select id from (select m.id from exam_marking m inner JOIN (SELECT work_id,user_id FROM exam_marking GROUP BY work_id,user_id HAVING COUNT(1)>1) t
            on t. work_id = m.work_id and t.user_id = m.user_id
            where m.id not in (SELECT MIN(id) AS id FROM exam_marking GROUP BY work_id,user_id HAVING COUNT(1)>1)
            and m.is_delete = 0
             and m.work_id in (3690,3687,3686)
        ) as a);
        Query OK, 11 rows affected (4.45 sec)
        Rows matched: 11  Changed: 11  Warnings: 0

        另外发现的数据都是来自普通的
        SELECT work_id,user_id,count(1),FROM_UNIXTIME(max(w.create_time)) maxCreate_time,FROM_UNIXTIME(min(w.create_time)) minCreate_time FROM exam_marking m left join exams_work w on w.id = m.work_id where m.is_delete = 0  and w.exam_source = 0 GROUP BY m.work_id,m.user_id HAVING COUNT(1)>1 order by maxCreate_time desc limit 10; 

        -- 2021-09-30 之前的数据重复
        SELECT work_id,user_id,count(1),FROM_UNIXTIME(create_time) FROM exam_marking where is_delete = 0  and create_time < UNIX_TIMESTAMP('2021-09-30 00:00:00') GROUP BY work_id,user_id HAVING COUNT(1)>1 ; 
        14226 rows

        SELECT work_id,user_id,FROM_UNIXTIME(max(create_time)) maxCreate_time,FROM_UNIXTIME(min(create_time)) FROM exam_marking where is_delete = 0  GROUP BY work_id,user_id HAVING COUNT(1)>1 ;
        // 最早最晚的时间
          SELECT FROM_UNIXTIME(max(create_time)) maxCreate_time,FROM_UNIXTIME(min(create_time)) FROM exam_marking where id in (SELECT id FROM exam_marking where is_delete = 0  GROUP BY work_id,user_id HAVING COUNT(1)>1);
        +---------------------+---------------------------------+
        | maxCreate_time      | FROM_UNIXTIME(min(create_time)) |
        +---------------------+---------------------------------+
        | 2022-04-21 09:12:02 | 2019-09-26 17:32:03             |
        +---------------------+---------------------------------+


        我没有改过普通的考试的加人,我只搞过培训和课程的
         select id, enterprise_id,work_id,user_id,FROM_UNIXTIME(create_time) from exam_marking  WHERE FROM_UNIXTIME(create_time) = '2022-04-21 09:12:02';
        +--------+---------------------+---------+-------------------+----------------------------+
        | id     | enterprise_id       | work_id | user_id           | FROM_UNIXTIME(create_time) |
        +--------+---------------------+---------+-------------------+----------------------------+
        | 490278 | qiyeyunkaoshixitong |    3974 | u1000002142634390 | 2022-04-21 09:12:02        |
        | 490279 | qiyeyunkaoshixitong |    3974 | u6000000503085869 | 2022-04-21 09:12:02        |
        | 490280 | qiyeyunkaoshixitong |    3974 | u1000002142634390 | 2022-04-21 09:12:02        |
        | 490281 | qiyeyunkaoshixitong |    3974 | u6000000503085869 | 2022-04-21 09:12:02        |
        +--------+---------------------+---------+-------------------+----------------------------+
        4 rows in set (0.36 sec)


        --给志强和建国的start-
         select count(1) from exam_marking m inner JOIN (SELECT work_id,user_id FROM exam_marking GROUP BY work_id,user_id HAVING COUNT(1)>1) t
         on t. work_id = m.work_id and t.user_id = m.user_id
         where m.id not in (SELECT MIN(id) AS id FROM exam_marking GROUP BY work_id,user_id HAVING COUNT(1)>1)
         and m.is_delete = 0;
         -- 185928 条重复数据

        -- 逻辑删除
        update exam_marking set is_delete = 1 where id in (select id from (select m.id from exam_marking m inner JOIN (SELECT work_id,user_id FROM exam_marking GROUP BY work_id,user_id HAVING COUNT(1)>1) t
             on t. work_id = m.work_id and t.user_id = m.user_id
             where m.id not in (SELECT MIN(id) AS id FROM exam_marking GROUP BY work_id,user_id HAVING COUNT(1)>1)
             and m.is_delete = 0
        ) as a);
        --给志强和建国的end-


    -- 可以按指定的考试id来处理 (实测处理marking表就行)
    select id from exams_work where enterprise_id ="jiangsudipukejigufen" and is_delete = 0\g;
    +------+
    | id   |
    +------+
    | 2925 |
    | 3646 |
    | 3653 |
    | 3654 |
    | 3664 |
    | 3738 |
    | 3749 |

    -- 由于需要同时按 work_id,user_id ,来限制范围,这里有个关键的 JOIN (SELECT work_id,user_id FROM exam_marking GROUP BY work_id,user_id HAVING COUNT(1)>1) t。不能分别where in work_id and where in user_id,这会串到其他数据
    -- 以下处理容器维度的数据
    select DISTINCT(count(m.id)) from exam_marking m inner JOIN (SELECT work_id,user_id FROM exam_marking GROUP BY work_id,user_id HAVING COUNT(1)>1) t
    on t. work_id = m.work_id and t.user_id = m.user_id
    where m.id not in (SELECT MIN(id) AS id FROM exam_marking GROUP BY work_id,user_id HAVING COUNT(1)>1)
    and m.is_delete = 0
    and m.work_id in (select id from exams_work where enterprise_id ="jiangsudipukejigufen" and is_delete = 0) ;

    update exam_marking set is_delete = 1 where id in (select id from (select m.id from exam_marking m inner JOIN (SELECT work_id,user_id FROM exam_marking GROUP BY work_id,user_id HAVING COUNT(1)>1) t
    on t. work_id = m.work_id and t.user_id = m.user_id
    where m.id not in (SELECT MIN(id) AS id FROM exam_marking GROUP BY work_id,user_id HAVING COUNT(1)>1)
    and m.is_delete = 0
    and m.work_id in (select id from exams_work where enterprise_id ="jiangsudipukejigufen" and is_delete = 0 )
    ) as a);

    线上这个容器只有这一条考试 ,日志从 2022-02-24 16:27:14,788 [pool-4-thread-8] DEBUG c.s.e.m.e.E.batchInsert [BaseJdbcLogger.java : 143] 开始看

    synchronized (lockGenExamMemberAndMarking) { 如果事务未提交的事务,这里可能也不行 。从日志上,一个子线程,一个主线程,如果没有事务,这里也会按主线程执行

    还需要特别注意 SELECT MIN(id) AS id FROM exam_marking GROUP BY work_id,user_id HAVING COUNT(1)>1 拿到的是不是最小id,另外参考 《SQL-按字段group by分组 并 返回最新的结果》 
    结论似乎就是min max 是取group by 的最小最大值的,而如果在 group by 后加ORDER BY,这也仅仅是最group by后的结果进行排序,拿到的不是最小最大值。
    如果需要对group by 前的结果进行过滤,就在group by 前 加where,
    如果需要对group by 后的结果再做筛选,就加having。
正文到此结束
本文目录