Jump to content

User:Topbanana/Reports/This page contains a link that might be mis-punctuated/SQL

From Wikipedia, the free encyclopedia

The SQL below should be run on a Link Analysis Database. It creates a number of extra tables to aid in the analysis of 'close matches' for red links.



-- Create tables and copy bad link targets and article titles

CREATE TABLE bad_squish (
    from_id mediumint(7) unsigned NOT NULL,
    orig_title varchar(255) binary NOT NULL,
    squish_title varchar(255) binary NOT NULL,

	PRIMARY KEY( from_id, orig_title )
) ENGINE=MyISAM;

CREATE TABLE art_squish (
    orig_title varchar(255) binary NOT NULL,
    squish_title varchar(255) binary NOT NULL,

	PRIMARY KEY( orig_title )
) ENGINE=MyISAM;

INSERT INTO art_squish
SELECT art_title, art_title
FROM art;

INSERT INTO bad_squish
SELECT from_id, link, link
FROM bad_links;

DELETE FROM art_squish
WHERE Length( orig_title  ) <=3;

DELETE FROM bad_squish
WHERE Length( orig_title  ) <=3;

-- Now progressively mangle the link targets and article titles, removing matches
-- as we go.

-- Remove punctuation and double spaces

UPDATE bad_squish SET squish_title = replace( squish_title, '(', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '(', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, ')', '' );
UPDATE art_squish SET squish_title = replace( squish_title, ')', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, '[', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '[', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, ']', '' );
UPDATE art_squish SET squish_title = replace( squish_title, ']', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, ',', '' );
UPDATE art_squish SET squish_title = replace( squish_title, ',', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, '!', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '!', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, '"', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '"', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, '%', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '%', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, '&', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '&', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, '*', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '*', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, '-', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '-', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, '+', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '+', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, '=', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '=', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, '{', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '{', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, '}', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '}', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, ':', '' );
UPDATE art_squish SET squish_title = replace( squish_title, ':', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, ';', '' );
UPDATE art_squish SET squish_title = replace( squish_title, ';', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, '@', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '@', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, '#', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '#', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, '<', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '<', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, '>', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '>', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, '.', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '.', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, '/', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '/', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, '\\', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '\\', '' );

UPDATE bad_squish SET squish_title = replace( squish_title, '''', '' );
UPDATE art_squish SET squish_title = replace( squish_title, '''', '' );

ALTER TABLE bad_squish ADD INDEX ( squish_title );
ALTER TABLE art_squish ADD INDEX ( squish_title );

SELECT concat( '*[[',art_title,']] links to [[', bad_squish.orig_title, ']], try ', group_concat( concat( '[[', art_squish.orig_title, ']]' )) )
INTO OUTFILE 'c:\punc.txt'
FROM art_squish, bad_squish, art
WHERE art.art_id = bad_squish.from_id
AND   bad_squish.squish_title = art_squish.squish_title
GROUP BY bad_squish.from_id, bad_squish.orig_title
ORDER BY art_title, bad_squish.orig_title;