Run Code
|
API
|
Code Wall
|
Misc
|
Feedback
|
Login
|
Theme
|
Privacy
|
Patreon
Movie Rating SQL Analysis Conducted by Miranda Zhao
-- Movie Rating SQL Analysis Conducted by Miranda Zhao ----------------------------------------------------------------------- -- database Script CREATE TABLE Movie ("mid" int, "title" varchar(25),"year" int, "director" varchar(25)) ; INSERT INTO Movie ("mid", "title", "year", "director") VALUES (101, 'Gone with the Wind', 1939, 'Victor Fleming'), (102, 'Star Wars', 1977, 'George Lucas'), (103, 'The Sound of Music', 1965, 'Robert Wise'), (104, 'E.T.', 1982, 'Steven Spielberg'), (105, 'Titanic', 1997, 'James Cameron'), (106, 'Snow White', 1937, 'NULL'), (107, 'Avatar', 2009, 'James Cameron'), (108, 'Raiders of the Lost Ark', 1981, 'Steven Spielberg'); CREATE TABLE Reviewer ("rid" int, "name" varchar(25)) ; INSERT INTO Reviewer ("rid", "name") VALUES (201, 'Sarah Martinez'), (202, 'Daniel Lewis'), (203, 'Brittany Harris'), (204, 'Mike Anderson'), (205, 'Chris Jackson'), (206, 'Elizabeth Thomas'), (207, 'James Cameron'), (208, 'Ashley White'); CREATE TABLE Ratings ("rid" int, "mid" int, "stars" int,"ratingdate" timestamp) ; INSERT INTO Ratings ("rid" , "mid" , "stars", "ratingdate") VALUES (201, 101, 2, '2011-01-22'), (201, 101, 4, '2011-01-27'), (202, 106, 4, NULL), (203, 103, 2, '2011-01-20'), (203, 108, 4, '2011-01-12'), (203, 108, 2, '2011-01-30'), (204, 101, 3, '2011-01-09'), (205, 103, 3, '2011-01-27'), (205, 104, 2, '2011-01-22'), (205, 108, 4, NULL), (206, 107, 3, '2011-01-15'), (206, 106, 5, '2011-01-19'), (207, 107, 5, '2011-01-20'), (208, 104, 3, '2011-01-02'); ----------------------------------------------------------------------- --Insight Script: --#0: view table: select * from Movie; select * from Reviewer; select * from Ratings; --#1: remove duplicates: --for all pairs of reviwers such that both reviewers gave a rating to the same movie --return the names of both reviewers. --Eliminate duplicates, dont' pair reviwers with themselves, and include each pair only once --for each pair, return the names in the pair in alphabetical order. --Method1: select distinct rt1.rid as rid1, rt2.rid as rid2, rv1.name as rid1name, rv2.name as rid2name from ratings rt1 join ratings rt2 on rt1.mid = rt2.mid join reviewer rv1 on rt1.rID = rv1.rID join reviewer rv2 on rt2.rID = rv2.rID where rt1.rid < rt2.rid order by rid1; -- Method 2: select distinct rv1.name, rv2.name from ratings rt1 join ratings rt2 on rt1.mID = rt2.mID join Reviewer rv1 on rt1.rID = rv1.rID join Reviewer rv2 on rt2.rID = rv2.rID where rv1.name < rv2.name order by rv1.name; --I. 避免重复的3个地方:只出现要的变量,distinct, where A>B(ID, name均可) -- ①只出现2个rid, 让select后不能出现两个mid以避免重复: -- 因为两个人同时给两个不同电影都评论(203-205, 108; 203-205, 103),会重复。 -- ②只出现2个rid,还要选distinct以避免重复: -- 因为两个人可以同时评论一个电影,但是其中一个给同一个电影评了2次,(201-204, 101) (201-204,101)。 -- ③用where A>B 来避免本身记录的重复。 --II.正确做法在自己join自己后,又将rating table连到了reviwer table 让rid指向了rname. -- Q6: Find the movie that has the lowest stars in the database, -- return the reviewer name, movie title, and number of stars. -- ※ 3各表join后,可以用star = select min(star); where rt.stars = (select min(stars) from ratings); select rv.name, mv.title, rt.stars from movie mv join ratings rt on mv.mID = rt.mID join reviewer rv on rt.rID = rv.rID where rt.stars = (select min(stars) from ratings); -- Q6: Find the movie that reviwer gives the lowest star for each reviwer. --return the reviewer name, movie title, and number of stars. (partition by rv.name) with cte as( select rv.name, rt.mid, mv.title, rt.stars, row_number() over (partition by rv.name order by rt.stars) as movie_rank from Ratings rt join Movie mv on rt.mid = mv.mid join Reviewer rv on rt.rid = rv.rid order by movie_rank) select * from cte where movie_rank = 1; -- Q6: Find the lowerst star for each movie. -- return the reviewer name, movie title, and number of stars. (partition by rt.mid/mv.title) --method 1: with cte as( select rv.name, rt.mid, mv.title, rt.stars, row_number() over (partition by mv.title order by rt.stars) as movie_rank from Ratings rt join Movie mv on rt.mid = mv.mid join Reviewer rv on rt.rid = rv.rid order by movie_rank) select * from cte where movie_rank = 1 order by mid; --method 2 with cte as( select rv.name, rt.mid, mv.title, rt.stars, row_number() over (partition by rt.mid order by rt.stars) as movie_rank from Ratings rt join Movie mv on rt.mid = mv.mid join Reviewer rv on rt.rid = rv.rid order by movie_rank) select * from cte where movie_rank = 1 order by mid; -- Q7: List movie titles and avg ratings, from highest rated to lowest-rated. -- If 2 or more movies have the same avg rating, list them in alphabetical order. select title, avg(stars) as avg from movie mv join ratings rt on mv.mID = rt.mID group by 1 order by 2 DESC, 1; -- method 2: select mv.title, rt.mid, sum(rt.stars)/ count(rt.rid) as avg_movie_rating from movie mv join ratings rt on mv.mid = rt.mid group by mv.title, rt.mid order by 3 DESC, 1; --order是发生在select之后的,就已经创建完了,所以可以用了 --一般having时还没有创建完,不能用 --Q8: find the names of all reviewers who have contributed 3 or more ratings --method 1: group by + having 选取两个join + groupby 表的筛选条件:having count(*) select rv.name from reviewer rv join ratings rt on rv.rID = rt.rID group by rv.name having count(*) >=3; --method 2: group by + having 选取两个join + groupby 表的筛选条件:having count(rt.stars) -- having count(*) = having count(rt.stars)可筛条件; 新建列rating_times无法直接having用 select rv.name, count(rt.stars) as rating_times from reviewer rv join ratings rt on rv.rid = rt.rid group by 1 having count(rt.stars) >= 3 order by 2 DESC; --method 3: 先出一个表有name + rating_times, 再用cte筛选新增列的条件,分两步走 --新建列rating_times可以在cte基础上使用 select * from (select rv.name, count(rt.stars) as rating_times from reviewer rv join ratings rt on rv.rid = rt.rid group by 1 order by 2 DESC) as cte where rating_times >= 3; --Q9: some directors directed more than 1 movie. For all such directors, -- return the titles of all movies directed by them, along with the director name -- sort by director name, then movie title -- 电影,导演(导了>1部的) --Method 1: self join on mv1.d = mv2.d + where mv1.title != mv2.title -->挑出只有同导不同电影的记录 --self join 把相同director的相同movie挑出来了(mid=mid, director1 = director2), --self join 也把相同director但不同movie的挑出来了(mid≠mid, director1 = director2) select mv1.mID, mv2.Mid, mv1.title, mv1.director from movie mv1 join movie mv2 on mv1.director = mv2.director where mv1.title != mv2.title order by mv1.director, mv1.title; --Method 2: --self join超级简洁,不用新建table, 自己可以完成挑选设定A变量同,能看到有多少B变量, --having可以把同A不同B的记录挑出来(having count(*) >2 ) select mv1.mID, mv1.director, count(*) as direct_times from movie mv1 join movie mv2 on mv1.director = mv2.director group by 1,2 having count(*) >=2 order by 2,1; --分解: --①self join on mv1.director = mv2.director --把相同director的相同movie挑出来了(mid=mid, director1 = director2), --也把相同director但不同movie的挑出来了(mid≠mid, director1 = director2) select mv1.mID, mv2.mID, mv1.director, mv2.director from movie mv1 join movie mv2 on mv1.director = mv2.director; --②use group by (mid,director) to find direct_times per movie --在group by mid, director的情况下,用count(director)/count(*)出现几次来计算导戏>1次的导演 select mv1.mid, mv1.director, count(mv1.director) as direct_times1, count(*) as direct_times2 from movie mv1 join movie mv2 on mv1.director = mv2.director group by 1,2 having count(*) >=2; --Method 2: --在单表内:count(列名) = count(*) , 均可以做筛选条件 --在cte table内having筛选条件,cte外加挑选变量 with cte as (select director, count(director) as direct_times1, count(*) as direct_times2 from movie mv group by 1 having count(*) >=2) select mv.mid, cte.director, cte.direct_times1, cte.direct_times2 from cte join movie mv on cte.director = mv.director order by 2; --Method 3: --在cte table外where筛选条件,cte外加挑选变量 with cte as (select director, count(director) as direct_times1, count(*) as direct_times2 from movie mv group by 1) select mv.mid, cte.director, cte.direct_times1, cte.direct_times2 from cte join movie mv on cte.director = mv.director where direct_times1 >=2 order by 2; --Q10: --Find the movies with highest average rating. Return the movie titles and average rating. --Method1: with cte as ( select mID, sum(rt.stars) / count(rt.rid) as avg_stars from ratings rt group by mID) select cte.mID, mv.title, cte.avg_stars from cte join movie mv on cte.mID = mv.mID where avg_stars = (select max(avg_stars) from cte); --Method2: with cte as (select rt.mid, mv.title, sum(rt.stars) / count(rt.rid) as avg_rating from ratings rt join Movie mv on rt.mid = mv.mid group by 1,2 order by 3 DESC) select * from cte where avg_rating = (select max(avg_rating) from cte); --Method3: select mv.mID, avg(stars) as avg_stars from ratings rt join movie mv on mv.mID = rt.mID group by mv.mID having avg(stars) = (select max(avg_stars) from ( select mID, avg(stars) AS avg_stars from ratings group by mID ) tmp ); select mv.mID, sum(rt.stars) / count(rt.rid) as avg_stars from ratings rt join movie mv on mv.mID = rt.mID group by mv.mID having sum(rt.stars) / count(rt.rid) = (select max(avg_stars) from (select mID, sum(rt.stars) / count(rt.rid) AS avg_stars from ratings rt group by mID)as cte); -- 11. For each director, return the director's name together with the titles of the movies -- they directed that received the highest rating among all of their movies, -- and the value of that rating. Ignore movies whose director is NULL --Method 1 SELECT * FROM (select mv.mid, mv.title, mv.director, rt.stars, dense_rank() over (partition by director order by stars DESC) as row_num from Ratings rt join Movie mv on mv.mID = rt.mID group by 1,2,3,4) AS CTE WHERE row_num = 1 order by 4 DESC; -- 同一导演的最高评分电影,应该用dense_rank而不是row_num, 因为row_num即使相同高分,名次也会顺延 --一般在cte table里的的逗号容易多加或少加; -- cte也可称作temp -- cte里可以直接去null(is not null) -- 在主query中可以直接call变量名不用table名 -- Method2 select director, title, stars from (select distinct mv.director, mv.title, rt.stars, dense_rank() over (partition by director order by stars desc) as stars_rank from ratings rt join movie mv on mv.mID = rt.mID where mv.director is not null) as tmp where stars_rank = 1 order by director, title; --Q4: Find the titles of all movies not reviewed by Chris Jackson --method1: --先将3个表join起来,让rid不等于CJ的rid --只要不在select后挑选过多变量,不会出现distinct后仍有很多title重复的情况 --where rid != (...) select distinct mv.title from movie mv join ratings rt on mv.mid = rt.mid join reviewer rv on rv.rid = rt.rid where rt.rid != (select rid from Reviewer where name = 'Chris Jackson'); --method2: --先将rt和rvjoin起来,找到CJ的mid; 再在movie中去掉这部分mid: where mid not in(...) --【这里将not in改成!=就会报错:more than one row returned by a subquery used as an expression】 -- not in 可以后接多个值, !=只能后接一个值 select title from movie where mid not in( select rt.mid from ratings rt join reviewer rv on rt.rID = rv.rID where rv.name = 'Chris Jackson'); --method3: --left join, 左表mv.title, 右表temp是CJ看过的mid, join on mid, 挑选出mid是空值的: --where tmp.mID is null select title from movie mv left join ( select rt.mID from ratings rt join reviewer rv on rt.rID = rv.rID where rv.name = 'Chris Jackson') tmp on mv.mID = tmp.mID where tmp.mID is NULL
run
|
edit
|
history
|
help
0
Yandex Task
Return IDs where there are exactly 3 occurrences1
Timestamps in PostgreSQL
Webiste SQL Analysis
tic tac toe
Dashes in numbers
nested array to table
PostreSQL: Subquery
Black Wall
Krug_test