Movie Rating SQL Analysis Conducted by Miranda Zhao, PostgreSQL

Movie Rating SQL Analysis Conducted by Miranda Zhao

-- Movie Rating SQL Analysis Conducted by Miranda Zhao

-----------------------------------------------------------------------
-- database Script

CREATE TABLE Movie
    ("mid" int, "title" varchar(25),"year" int, "director" varchar(25))
;
    
INSERT INTO Movie
    ("mid", "title", "year", "director")
VALUES
    (101, 'Gone with the Wind', 1939, 'Victor Fleming'),
    (102, 'Star Wars', 1977, 'George Lucas'),
    (103, 'The Sound of Music', 1965, 'Robert Wise'),
    (104, 'E.T.', 1982, 'Steven Spielberg'),
    (105, 'Titanic', 1997, 'James Cameron'),
    (106, 'Snow White', 1937, 'NULL'),
    (107, 'Avatar', 2009, 'James Cameron'),
    (108, 'Raiders of the Lost Ark', 1981, 'Steven Spielberg');

CREATE TABLE Reviewer
    ("rid" int, "name" varchar(25))
;
    
INSERT INTO Reviewer
    ("rid", "name")
VALUES
    (201, 'Sarah Martinez'),
    (202, 'Daniel Lewis'),
    (203, 'Brittany Harris'),
    (204, 'Mike Anderson'),
    (205, 'Chris Jackson'),
    (206, 'Elizabeth Thomas'),
    (207, 'James Cameron'),
    (208, 'Ashley White');

CREATE TABLE Ratings
    ("rid" int, "mid" int,  "stars" int,"ratingdate" timestamp)
;
    
INSERT INTO Ratings
    ("rid" , "mid" ,  "stars", "ratingdate")
    
VALUES
    (201, 101, 2, '2011-01-22'),
    (201, 101, 4, '2011-01-27'),
    (202, 106, 4,  NULL),
    (203, 103, 2, '2011-01-20'),
    (203, 108, 4, '2011-01-12'),
    (203, 108, 2, '2011-01-30'),
    (204, 101, 3, '2011-01-09'),
    (205, 103, 3, '2011-01-27'),  
    (205, 104, 2, '2011-01-22'),  
    (205, 108, 4,  NULL),  
    (206, 107, 3, '2011-01-15'),    
    (206, 106, 5, '2011-01-19'),    
    (207, 107, 5, '2011-01-20'),
    (208, 104, 3, '2011-01-02');

-----------------------------------------------------------------------
--Insight Script:

--#0: view table:
select * from Movie;

select * from Reviewer;

select * from Ratings;

--#1: remove duplicates:
--for all pairs of reviwers such that both reviewers gave a rating to the same movie
--return the names of both reviewers. 
--Eliminate duplicates, dont' pair reviwers with themselves, and include each pair only once
--for each pair, return the names in the pair in alphabetical order.

--Method1:
select  distinct rt1.rid as rid1, 
                 rt2.rid as rid2,
                 rv1.name as rid1name,
                 rv2.name as rid2name
from ratings rt1
join ratings rt2 
on rt1.mid = rt2.mid
join reviewer rv1 on rt1.rID = rv1.rID
join reviewer rv2 on rt2.rID = rv2.rID
where rt1.rid < rt2.rid
order by rid1;

-- Method 2:
select distinct rv1.name, rv2.name
from ratings rt1
join ratings rt2
on rt1.mID = rt2.mID
join Reviewer rv1 on rt1.rID = rv1.rID
join Reviewer rv2 on rt2.rID = rv2.rID
where rv1.name < rv2.name
order by rv1.name;

--I. 避免重复的3个地方：只出现要的变量，distinct, where A>B（ID, name均可）

-- ①只出现2个rid, 让select后不能出现两个mid以避免重复:
-- 因为两个人同时给两个不同电影都评论（203-205, 108; 203-205, 103），会重复。

-- ②只出现2个rid，还要选distinct以避免重复：
-- 因为两个人可以同时评论一个电影，但是其中一个给同一个电影评了2次，(201-204, 101) (201-204,101)。

-- ③用where A>B 来避免本身记录的重复。
--II.正确做法在自己join自己后，又将rating table连到了reviwer table 让rid指向了rname.

-- Q6: Find the movie that has the lowest stars in the database, 
-- return the reviewer name, movie title, and number of stars. 
-- ※ 3各表join后，可以用star = select min(star); where rt.stars = (select min(stars) from ratings);

select rv.name, mv.title, rt.stars
from movie mv
join ratings rt on mv.mID = rt.mID
join reviewer rv on rt.rID = rv.rID
where rt.stars = (select min(stars) from ratings);

-- Q6: Find the movie that reviwer gives the lowest star for each reviwer.
--return the reviewer name, movie title, and number of stars. (partition by rv.name) 
with cte as(
    
select rv.name, rt.mid, mv.title, rt.stars,
row_number() over (partition by rv.name order by rt.stars) as movie_rank
from Ratings rt
join Movie mv on rt.mid = mv.mid
join Reviewer rv on rt.rid = rv.rid
order by movie_rank)
select * from cte
where movie_rank = 1;

-- Q6: Find the lowerst star for each movie.
-- return the reviewer name, movie title, and number of stars. (partition by rt.mid/mv.title)

--method 1:
with cte as(
select rv.name, rt.mid, mv.title, rt.stars,
row_number() over (partition by mv.title order by rt.stars) as movie_rank
from Ratings rt
join Movie mv on rt.mid = mv.mid
join Reviewer rv on rt.rid = rv.rid
order by movie_rank)
select * from cte
where movie_rank = 1
order by mid;

--method 2
with cte as(
select rv.name, rt.mid, mv.title, rt.stars,
row_number() over (partition by rt.mid order by rt.stars) as movie_rank
from Ratings rt
join Movie mv on rt.mid = mv.mid
join Reviewer rv on rt.rid = rv.rid
order by movie_rank)
select * from cte
where movie_rank = 1
order by mid;

-- Q7: List movie titles and avg ratings, from highest rated to lowest-rated. 
--     If 2 or more movies have the same avg rating, list them in alphabetical order.

select title, avg(stars) as avg
from movie mv 
join ratings rt on mv.mID = rt.mID
group by 1
order by 2 DESC, 1;

-- method 2:
select mv.title, rt.mid, sum(rt.stars)/ count(rt.rid) as avg_movie_rating
from movie mv
join ratings rt on mv.mid = rt.mid
group by mv.title, rt.mid
order by 3 DESC, 1;

--order是发生在select之后的，就已经创建完了，所以可以用了
--一般having时还没有创建完，不能用

--Q8: find the names of all reviewers who have contributed 3 or more ratings

--method 1: group by + having 选取两个join + groupby 表的筛选条件：having count(*)
select rv.name
from reviewer rv
join ratings rt on rv.rID = rt.rID
group by rv.name
having count(*) >=3;

--method 2: group by + having 选取两个join + groupby 表的筛选条件：having count(rt.stars)
-- having count(*) = having count(rt.stars)可筛条件； 新建列rating_times无法直接having用
select rv.name, count(rt.stars) as rating_times
from reviewer rv
join ratings rt on rv.rid = rt.rid
group by 1
having count(rt.stars) >= 3
order by 2 DESC;

--method 3: 先出一个表有name + rating_times, 再用cte筛选新增列的条件，分两步走
--新建列rating_times可以在cte基础上使用
select * from
(select rv.name, count(rt.stars) as rating_times
from reviewer rv
join ratings rt on rv.rid = rt.rid
group by 1
order by 2 DESC) as cte
where rating_times >= 3;

--Q9: some directors directed more than 1 movie. For all such directors,
-- return the titles of all movies directed by them, along with the director name
-- sort by director name, then movie title
-- 电影，导演（导了>1部的）

--Method 1: self join on mv1.d = mv2.d + where mv1.title != mv2.title -->挑出只有同导不同电影的记录
--self join 把相同director的相同movie挑出来了(mid=mid, director1 = director2)，
--self join 也把相同director但不同movie的挑出来了(mid≠mid, director1 = director2）

select mv1.mID, mv2.Mid, mv1.title, mv1.director
from movie mv1 
join movie mv2 
on mv1.director = mv2.director 
where mv1.title != mv2.title
order by mv1.director, mv1.title;

--Method 2:
--self join超级简洁，不用新建table, 自己可以完成挑选设定A变量同，能看到有多少B变量，
--having可以把同A不同B的记录挑出来（having count(*) >2 ）
select mv1.mID, mv1.director, count(*) as direct_times
from movie mv1
join movie mv2 
on mv1.director = mv2.director 
group by 1,2
having count(*) >=2
order by 2,1;

--分解：
--①self join on mv1.director = mv2.director
--把相同director的相同movie挑出来了(mid=mid, director1 = director2)，
--也把相同director但不同movie的挑出来了(mid≠mid, director1 = director2）
select mv1.mID, mv2.mID, mv1.director, mv2.director
from movie mv1
join movie mv2 
on mv1.director = mv2.director;

--②use group by (mid,director) to find direct_times per movie
--在group by mid, director的情况下，用count(director)/count(*)出现几次来计算导戏>1次的导演
select mv1.mid, mv1.director, count(mv1.director) as direct_times1, count(*) as direct_times2
from movie mv1
join movie mv2 
on mv1.director = mv2.director
group by 1,2
having count(*) >=2;

--Method 2:
--在单表内：count(列名) = count(*) , 均可以做筛选条件
--在cte table内having筛选条件，cte外加挑选变量

with cte as
(select director, count(director) as direct_times1, count(*) as direct_times2
from movie mv
group by 1
having count(*) >=2)
select mv.mid, cte.director, cte.direct_times1, cte.direct_times2
from cte 
join movie mv on cte.director = mv.director
order by 2;

--Method 3:
--在cte table外where筛选条件，cte外加挑选变量
with cte as
(select director, count(director) as direct_times1, count(*) as direct_times2
from movie mv
group by 1)
select mv.mid, cte.director, cte.direct_times1, cte.direct_times2
from cte 
join movie mv on cte.director = mv.director
where direct_times1 >=2
order by 2;

--Q10: 
--Find the movies with highest average rating. Return the movie titles and average rating.

--Method1:

with cte as (
select mID, sum(rt.stars) / count(rt.rid) as avg_stars
from ratings rt 
group by mID)
select cte.mID, mv.title, cte.avg_stars
from cte
join movie mv on cte.mID = mv.mID 
where avg_stars = (select max(avg_stars) from cte);

--Method2:
with cte as 
(select rt.mid, mv.title, sum(rt.stars) / count(rt.rid) as avg_rating
from ratings rt
join Movie mv
on rt.mid = mv.mid
group by 1,2
order by 3 DESC)
select * from cte
where avg_rating = (select max(avg_rating) from cte);

--Method3:
select mv.mID, avg(stars) as avg_stars
from ratings rt join movie mv
on mv.mID = rt.mID 
group by mv.mID
having avg(stars) = 
                    (select max(avg_stars)
                     from
                      (
                       select mID, avg(stars) AS avg_stars
                       from ratings 
                       group by mID
                       ) tmp
                     );
 
 
select mv.mID, sum(rt.stars) / count(rt.rid)  as avg_stars
from ratings rt join movie mv
on mv.mID = rt.mID 
group by mv.mID
having sum(rt.stars) / count(rt.rid) = 
(select max(avg_stars)
from
(select mID, sum(rt.stars) / count(rt.rid)  AS avg_stars
from ratings rt
group by mID)as cte);

-- 11. For each director, return the director's name together with the titles of the movies
-- they directed that received the highest rating among all of their movies, 
-- and the value of that rating. Ignore movies whose director is NULL

--Method 1
SELECT * FROM
(select mv.mid, mv.title, mv.director, rt.stars,
       dense_rank() over (partition by director order by stars DESC) as row_num
from Ratings rt
join Movie mv
on mv.mID = rt.mID 
group by 1,2,3,4) AS CTE
WHERE row_num = 1 
order by 4 DESC;

-- 同一导演的最高评分电影，应该用dense_rank而不是row_num, 因为row_num即使相同高分，名次也会顺延
--一般在cte table里的的逗号容易多加或少加；
-- cte也可称作temp
-- cte里可以直接去null(is not null)
-- 在主query中可以直接call变量名不用table名

-- Method2
select director, title, stars
from 
(select distinct mv.director, mv.title, rt.stars,
 dense_rank() over (partition by director order by stars desc) as stars_rank
 from ratings rt join movie mv on mv.mID = rt.mID
 where mv.director is not null) as tmp 
 where stars_rank = 1
 order by director, title;

--Q4: Find the titles of all movies not reviewed by Chris Jackson
--method1:
--先将3个表join起来，让rid不等于CJ的rid
--只要不在select后挑选过多变量，不会出现distinct后仍有很多title重复的情况
--where rid != (...)
select distinct mv.title
from movie mv join ratings rt on mv.mid = rt.mid
join reviewer rv on rv.rid = rt.rid
where rt.rid != (select rid from Reviewer where name = 'Chris Jackson');

--method2: 
--先将rt和rvjoin起来，找到CJ的mid; 再在movie中去掉这部分mid: where mid not in(...)
--【这里将not in改成!=就会报错：more than one row returned by a subquery used as an expression】
-- not in 可以后接多个值， !=只能后接一个值
select title from movie 
where mid not in(
select rt.mid from ratings rt
join reviewer rv on rt.rID = rv.rID
where rv.name = 'Chris Jackson');

--method3:
--left join, 左表mv.title, 右表temp是CJ看过的mid, join on mid, 挑选出mid是空值的：
--where tmp.mID is null
select title
from movie mv 
left join (
select rt.mID 
from ratings rt
join reviewer rv on rt.rID  = rv.rID 
where rv.name = 'Chris Jackson') tmp
on mv.mID = tmp.mID
where tmp.mID is NULL


run \| edit \| history \| help	0

λ

                                      .NET NoSQL database for rapid development