All you need to know about SQL in a data science interview

3 min readSep 27, 2024

In this article, I will delve into the essential SQL scripts that are crucial for acing a data science interview. Mastering these scripts will not only enhance your technical skills but also boost your confidence during the interview process. I will cover a range of topics, from basic queries to advanced techniques, ensuring you are well-prepared to tackle any SQL-related question that comes your way.

Basic SQL queries

SELECT descript, time
       COALESCE(descript, 'No Description') --  returning the first non-NULL value from a list of expressions
  FROM tutorial.sf_crime_incidents_cleandate
 ORDER BY descript DESC, time


-------------------------------------------------------
SELECT select_list1 
FROM table_name1
MINUS / UNION / / UNION ALL / INTERSECT
SELECT select_list2 
FROM table_name2;



-------------------------------------------------------
SELECT companies.category_code,
       SUM(employee_count), 
       MIN(employee_count),
       MAX(employee_count),
       AVG(employee_count),
       COUNT(CASE WHEN acquisitions.acquired_at_cleaned <= companies.founded_at_clean::timestamp + INTERVAL '3 years'
                       THEN 1 ELSE NULL END) AS acquired_3_yrs,
       COUNT(CASE WHEN acquisitions.acquired_at_cleaned <= companies.founded_at_clean::timestamp + INTERVAL '5 years'
                       THEN 1 ELSE NULL END) AS acquired_5_yrs,
       COUNT(CASE WHEN acquisitions.acquired_at_cleaned <= companies.founded_at_clean::timestamp + INTERVAL '10 years'
                       THEN 1 ELSE NULL END) AS acquired_10_yrs,
       COUNT(1) AS total
  FROM tutorial.crunchbase_companies_clean_date companies
  JOIN tutorial.crunchbase_acquisitions_clean_date acquisitions
    ON acquisitions.company_permalink = companies.permalink
 WHERE founded_at_clean IS NOT NULL
 GROUP BY 1
 ORDER BY 5 DESC

Joins and Sub Queries

SELECT *
  FROM tutorial.sf_crime_incidents_2014_01
 WHERE Date IN (SELECT date
                 FROM tutorial.sf_crime_incidents_2014_01
                ORDER BY date
                LIMIT 5
              )

-------------------------------------------------------

WITH tmp AS
(
SELECT * 
FROM 
table1 LEFT JOIN table2
ON table1.id = table2.id
)
SELECT table3.*
FROM
tmp INNER JOIN table3
ON tmp.id = table3.id
WHERE 
tmp.date = '2024-01-01'

String Manipulation



SELECT date,
       LEFT(date, 10) AS cleaned_date,
       RIGHT(date, LENGTH(date) - 11) AS cleaned_time,
       SUBSTR(date, 4, 2) AS day, 

       TRIM(both '()' FROM location),
       POSITION('A' IN descript) AS a_position,
       
       TRIM(leading '(' FROM LEFT(location, POSITION(',' IN location) - 1)) AS lattitude,
       TRIM(trailing ')' FROM RIGHT(location, LENGTH(location) - POSITION(',' IN location) ) ) AS longitude,

       CONCAT(day_of_week, ', ', LEFT(date, 10)) AS day_and_date,
       day_of_week || ', ' || LEFT(date, 10) AS day_and_date2,
       
       UPPER(address) AS address_upper,
       LOWER(address) AS address_lower
      

  FROM tutorial.sf_crime_incidents_2014_01

Date and Time Functions

SELECT DATEDIFF(year, '2017/08/25', '2011/08/25') AS DateDiff;

SELECT DATE_ADD('2021-01-01', 3)
--'2021-01-04'



SELECT 
companies.founded_at_clean::timestamp + INTERVAL '1 week' AS plus_one_week,
NOW() - companies.founded_at_clean::timestamp AS founded_time_ago
FROM tutorial.crunchbase_companies_clean_date companies;


-------------------------------------------------------
SELECT cleaned_date,
       EXTRACT('year'   FROM cleaned_date) AS year,
       EXTRACT('month'  FROM cleaned_date) AS month,
       EXTRACT('day'    FROM cleaned_date) AS day,
       EXTRACT('hour'   FROM cleaned_date) AS hour,
       EXTRACT('minute' FROM cleaned_date) AS minute,
       EXTRACT('second' FROM cleaned_date) AS second,
       EXTRACT('decade' FROM cleaned_date) AS decade,
       EXTRACT('dow'    FROM cleaned_date) AS day_of_week
  FROM tutorial.sf_crime_incidents_cleandate;

-------------------------------------------------------
SELECT cleaned_date,
       DATE_TRUNC('year'   , cleaned_date) AS year,
       DATE_TRUNC('month'  , cleaned_date) AS month,
       DATE_TRUNC('week'   , cleaned_date) AS week,
       DATE_TRUNC('day'    , cleaned_date) AS day,
       DATE_TRUNC('hour'   , cleaned_date) AS hour,
       DATE_TRUNC('minute' , cleaned_date) AS minute,
       DATE_TRUNC('second' , cleaned_date) AS second,
       DATE_TRUNC('decade' , cleaned_date) AS decade
  FROM tutorial.sf_crime_incidents_cleandate;

-------------------------------------------------------
SELECT CURRENT_DATE AS date,
       CURRENT_TIME AS time,
       CURRENT_TIME AT TIME ZONE 'PST' AS time_pst, 
       CURRENT_TIMESTAMP AS timestamp,
       LOCALTIME AS localtime,
       LOCALTIMESTAMP AS localtimestamp,
       NOW() AS now,
       NOW() AT TIME ZONE 'PST' AS now_pst
;


-------------------------------------------------------

Window Functions

SELECT 
duration_seconds,
SUM(duration_seconds) OVER (ORDER BY start_time) AS running_total
FROM tutorial.dc_bikeshare_q1_2012


-------------------------------------------------------
SELECT 
start_terminal,
duration_seconds,

ROW_NUMBER() OVER (ORDER BY start_time) AS row, 
ROW_NUMBER() OVER (PARTITION BY start_terminal ORDER BY start_time) AS row_number, 
RANK() OVER (PARTITION BY start_terminal ORDER BY start_time) AS rank, 
DENSE_RANK() OVER (PARTITION BY start_terminal ORDER BY start_time) AS dense_rank,--The DENSE_RANK function is similar to the RANK function however the DENSE_RANK function does not skip any ranks if there is a tie between the ranks of the preceding records.

SUM(duration_seconds) OVER (PARTITION BY start_terminal ORDER BY start_time) AS running_total,
COUNT(duration_seconds) OVER (PARTITION BY start_terminal ORDER BY start_time) AS running_count,
AVG(duration_seconds) OVER (PARTITION BY start_terminal ORDER BY start_time) AS running_avg
(duration_seconds/SUM(duration_seconds) OVER (PARTITION BY start_terminal))*100 AS pct_of_total_time,

NTILE(4) OVER (PARTITION BY start_terminal ORDER BY duration_seconds) AS quartile,
NTILE(100) OVER (PARTITION BY start_terminal ORDER BY duration_seconds) AS percentile,


LAG(duration_seconds, 1) OVER (PARTITION BY start_terminal ORDER BY duration_seconds) AS lag,
LEAD(duration_seconds, 1) OVER (PARTITION BY start_terminal ORDER BY duration_seconds) AS lead,

duration_seconds - LAG(duration_seconds, 1) OVER (PARTITION BY start_terminal ORDER BY duration_seconds) AS difference

FROM tutorial.dc_bikeshare_q1_2012

Other

select 
round(630/60.0,2), 
cast(round(630/60.0,2) as numeric(36,2))
--10.500000   10.50



SELECT CAST(funding_total_usd AS varchar) AS funding_total_usd_string,
       founded_at_clean::varchar AS founded_at_string
       companies.founded_at_clean::timestamp AS time_to_acquisition
FROM tutorial.crunchbase_companies_clean_date