All you need to know about SQL in a data science interview
3 min readSep 27, 2024
In this article, I will delve into the essential SQL scripts that are crucial for acing a data science interview. Mastering these scripts will not only enhance your technical skills but also boost your confidence during the interview process. I will cover a range of topics, from basic queries to advanced techniques, ensuring you are well-prepared to tackle any SQL-related question that comes your way.
Basic SQL queries
SELECT descript, time
COALESCE(descript, 'No Description') -- returning the first non-NULL value from a list of expressions
FROM tutorial.sf_crime_incidents_cleandate
ORDER BY descript DESC, time
-------------------------------------------------------
SELECT select_list1
FROM table_name1
MINUS / UNION / / UNION ALL / INTERSECT
SELECT select_list2
FROM table_name2;
-------------------------------------------------------
SELECT companies.category_code,
SUM(employee_count),
MIN(employee_count),
MAX(employee_count),
AVG(employee_count),
COUNT(CASE WHEN acquisitions.acquired_at_cleaned <= companies.founded_at_clean::timestamp + INTERVAL '3 years'
THEN 1 ELSE NULL END) AS acquired_3_yrs,
COUNT(CASE WHEN acquisitions.acquired_at_cleaned <= companies.founded_at_clean::timestamp + INTERVAL '5 years'
THEN 1 ELSE NULL END) AS acquired_5_yrs,
COUNT(CASE WHEN acquisitions.acquired_at_cleaned <= companies.founded_at_clean::timestamp + INTERVAL '10 years'
THEN 1 ELSE NULL END) AS acquired_10_yrs,
COUNT(1) AS total
FROM tutorial.crunchbase_companies_clean_date companies
JOIN tutorial.crunchbase_acquisitions_clean_date acquisitions
ON acquisitions.company_permalink = companies.permalink
WHERE founded_at_clean IS NOT NULL
GROUP BY 1
ORDER BY 5 DESC
Joins and Sub Queries
SELECT *
FROM tutorial.sf_crime_incidents_2014_01
WHERE Date IN (SELECT date
FROM tutorial.sf_crime_incidents_2014_01
ORDER BY date
LIMIT 5
)
-------------------------------------------------------
WITH tmp AS
(
SELECT *
FROM
table1 LEFT JOIN table2
ON table1.id = table2.id
)
SELECT table3.*
FROM
tmp INNER JOIN table3
ON tmp.id = table3.id
WHERE
tmp.date = '2024-01-01'
String Manipulation
SELECT date,
LEFT(date, 10) AS cleaned_date,
RIGHT(date, LENGTH(date) - 11) AS cleaned_time,
SUBSTR(date, 4, 2) AS day,
TRIM(both '()' FROM location),
POSITION('A' IN descript) AS a_position,
TRIM(leading '(' FROM LEFT(location, POSITION(',' IN location) - 1)) AS lattitude,
TRIM(trailing ')' FROM RIGHT(location, LENGTH(location) - POSITION(',' IN location) ) ) AS longitude,
CONCAT(day_of_week, ', ', LEFT(date, 10)) AS day_and_date,
day_of_week || ', ' || LEFT(date, 10) AS day_and_date2,
UPPER(address) AS address_upper,
LOWER(address) AS address_lower
FROM tutorial.sf_crime_incidents_2014_01
Date and Time Functions
SELECT DATEDIFF(year, '2017/08/25', '2011/08/25') AS DateDiff;
SELECT DATE_ADD('2021-01-01', 3)
--'2021-01-04'
SELECT
companies.founded_at_clean::timestamp + INTERVAL '1 week' AS plus_one_week,
NOW() - companies.founded_at_clean::timestamp AS founded_time_ago
FROM tutorial.crunchbase_companies_clean_date companies;
-------------------------------------------------------
SELECT cleaned_date,
EXTRACT('year' FROM cleaned_date) AS year,
EXTRACT('month' FROM cleaned_date) AS month,
EXTRACT('day' FROM cleaned_date) AS day,
EXTRACT('hour' FROM cleaned_date) AS hour,
EXTRACT('minute' FROM cleaned_date) AS minute,
EXTRACT('second' FROM cleaned_date) AS second,
EXTRACT('decade' FROM cleaned_date) AS decade,
EXTRACT('dow' FROM cleaned_date) AS day_of_week
FROM tutorial.sf_crime_incidents_cleandate;
-------------------------------------------------------
SELECT cleaned_date,
DATE_TRUNC('year' , cleaned_date) AS year,
DATE_TRUNC('month' , cleaned_date) AS month,
DATE_TRUNC('week' , cleaned_date) AS week,
DATE_TRUNC('day' , cleaned_date) AS day,
DATE_TRUNC('hour' , cleaned_date) AS hour,
DATE_TRUNC('minute' , cleaned_date) AS minute,
DATE_TRUNC('second' , cleaned_date) AS second,
DATE_TRUNC('decade' , cleaned_date) AS decade
FROM tutorial.sf_crime_incidents_cleandate;
-------------------------------------------------------
SELECT CURRENT_DATE AS date,
CURRENT_TIME AS time,
CURRENT_TIME AT TIME ZONE 'PST' AS time_pst,
CURRENT_TIMESTAMP AS timestamp,
LOCALTIME AS localtime,
LOCALTIMESTAMP AS localtimestamp,
NOW() AS now,
NOW() AT TIME ZONE 'PST' AS now_pst
;
-------------------------------------------------------
Window Functions
SELECT
duration_seconds,
SUM(duration_seconds) OVER (ORDER BY start_time) AS running_total
FROM tutorial.dc_bikeshare_q1_2012
-------------------------------------------------------
SELECT
start_terminal,
duration_seconds,
ROW_NUMBER() OVER (ORDER BY start_time) AS row,
ROW_NUMBER() OVER (PARTITION BY start_terminal ORDER BY start_time) AS row_number,
RANK() OVER (PARTITION BY start_terminal ORDER BY start_time) AS rank,
DENSE_RANK() OVER (PARTITION BY start_terminal ORDER BY start_time) AS dense_rank,--The DENSE_RANK function is similar to the RANK function however the DENSE_RANK function does not skip any ranks if there is a tie between the ranks of the preceding records.
SUM(duration_seconds) OVER (PARTITION BY start_terminal ORDER BY start_time) AS running_total,
COUNT(duration_seconds) OVER (PARTITION BY start_terminal ORDER BY start_time) AS running_count,
AVG(duration_seconds) OVER (PARTITION BY start_terminal ORDER BY start_time) AS running_avg
(duration_seconds/SUM(duration_seconds) OVER (PARTITION BY start_terminal))*100 AS pct_of_total_time,
NTILE(4) OVER (PARTITION BY start_terminal ORDER BY duration_seconds) AS quartile,
NTILE(100) OVER (PARTITION BY start_terminal ORDER BY duration_seconds) AS percentile,
LAG(duration_seconds, 1) OVER (PARTITION BY start_terminal ORDER BY duration_seconds) AS lag,
LEAD(duration_seconds, 1) OVER (PARTITION BY start_terminal ORDER BY duration_seconds) AS lead,
duration_seconds - LAG(duration_seconds, 1) OVER (PARTITION BY start_terminal ORDER BY duration_seconds) AS difference
FROM tutorial.dc_bikeshare_q1_2012
Other
select
round(630/60.0,2),
cast(round(630/60.0,2) as numeric(36,2))
--10.500000 10.50
SELECT CAST(funding_total_usd AS varchar) AS funding_total_usd_string,
founded_at_clean::varchar AS founded_at_string
companies.founded_at_clean::timestamp AS time_to_acquisition
FROM tutorial.crunchbase_companies_clean_date