Code snippets

Small reusable snippets I reach for when working with data, analytics, modelling and web projects.

Docker

Basic Dockerfile for Python

FROM python:3.11-slim

WORKDIR /app

# Copy only the dependency manifests first so the install layer below is
# reused from cache until pyproject.toml / poetry.lock change.
COPY pyproject.toml poetry.lock* /app/

# Install the dependencies declared in the manifests. Poetry is told not to
# create a virtualenv, so packages land in the image's own interpreter;
# --no-root skips installing the project itself because its source has not
# been copied yet. --no-cache-dir keeps pip's download cache out of the layer.
RUN pip install --no-cache-dir --upgrade pip poetry \
    && poetry config virtualenvs.create false \
    && poetry install --no-root --no-interaction --no-ansi

# Application source changes most often, so it is copied last.
COPY . /app

CMD ["python", "src/main.py"]

Docker Compose service

# `version` is informational only for Compose v2; older docker-compose
# releases still read it.
version: "3.8"
services:
  web:
    # Build the image from the Dockerfile in this directory.
    build: .
    ports:
      # host:container
      - "8000:8000"
    volumes:
      # Bind-mount the project directory over /app in the container.
      - .:/app

Start shell in running container

# Open an interactive bash session inside an already-running container.
docker exec --interactive --tty <container_name> bash

Start shell from image

# Start a new container from <image>, replacing its entrypoint with bash.
docker run --interactive --tty --entrypoint /bin/bash <image>

Image layer history

# Show the layers that make up <image_name> and the instruction that created each one.
docker image history <image_name>

Execute Python inside container

# Run <script_name> with the `python` found inside the running container,
# passing <arg1> and <arg2> through as script arguments.
docker exec <container_name> python <script_name> <arg1> <arg2>

Ubuntu + Python + GHC Dockerfile

# Pinned instead of :latest so rebuilds are reproducible. 18.04 (bionic) is
# the release whose archive provides python3.6 and openjdk-8-jdk.
# NOTE(review): confirm the hvr/ghc PPA still publishes the versions below
# for bionic before relying on this image.
FROM ubuntu:18.04

# Build-time only: stops apt from prompting (e.g. tzdata) without leaving
# the setting in the runtime environment.
ARG DEBIAN_FRONTEND=noninteractive

# Register the GHC PPA, then install Python and Java.
# update + install share one layer so a cached, stale package index is never
# reused, and the index is removed in the same layer that downloaded it.
RUN apt-get update \
    && apt-get install -y software-properties-common \
    && apt-add-repository -y "ppa:hvr/ghc" \
    && apt-get update \
    && apt-get install -y \
        openjdk-8-jdk \
        python3-pip \
        python3.6 \
    && rm -rf /var/lib/apt/lists/*

# Install GHC and Cabal (Haskell toolchain)
RUN apt-get -yq update \
    && apt-get -yq --no-install-suggests --no-install-recommends install \
        alex-3.1.7 \
        cabal-install-2.0 \
        cabal-install-head \
        ghc-8.2.2 \
        happy-1.19.5 \
    && rm -rf /var/lib/apt/lists/*

# Put the toolchain on PATH. The home directory is spelled out because ENV
# only substitutes variables declared in the Dockerfile; $HOME is not one of
# them and would expand to an empty string, yielding "/.local/bin".
ENV PATH=/root/.local/bin:/opt/ghc/8.2.2/bin:/opt/cabal/2.0/bin:/opt/happy/1.19.5/bin:/opt/alex/3.1.7/bin:$PATH

# Install Haskell libraries
RUN cabal update \
    && cabal install csv parsec text

# Create and switch to the working directory
WORKDIR /app

# Install Python packages before copying the rest of the source, so this
# layer is rebuilt only when requirements.txt changes.
COPY requirements.txt /app/
RUN pip3 install --no-cache-dir -r requirements.txt

# Copy the current directory into the working directory
COPY . /app

Python

Core Python

Read environment variable

import os

def get_env(name, default=None):
    """Look up an environment variable.

    Args:
        name: Name of the variable to read.
        default: Value returned when the variable is not set.

    Returns:
        The variable's value, or ``default`` if it is unset.
    """
    return os.environ.get(name, default)

Simple timing context manager

import time
from contextlib import contextmanager

@contextmanager
def timeit(label=None):
    """Context manager that prints how long the wrapped block took.

    Args:
        label: Text printed before the duration; ``'elapsed'`` is used when
            it is omitted or empty.

    Yields:
        None. The duration is printed as ``"<label>: <seconds>s"`` with
        three decimal places when the block exits.
    """
    # perf_counter is monotonic, so the measurement cannot go negative or
    # jump when the system clock is adjusted mid-run.
    start = time.perf_counter()
    try:
        yield
    finally:
        # Report even when the timed block raises; the exception still
        # propagates to the caller afterwards.
        elapsed = time.perf_counter() - start
        print(f"{label or 'elapsed'}: {elapsed:.3f}s")

pandas

Read CSV and inspect

import pandas as pd

# Load the CSV, converting the 'date' column to datetimes while reading.
df = pd.read_csv('data.csv', parse_dates=['date'])

# Quick sanity check: dimensions first, then the first few rows.
for preview in (df.shape, df.head()):
    print(preview)

Group by summary

# Per-category mean, sum and row count of the 'value' column.
value_stats = ['mean', 'sum', 'count']
summary = df.groupby('category').agg({'value': value_stats})
print(summary)

Date parsing

# Convert the 'date' column to datetime values in place...
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
# ...then use that column as the frame's index.
df = df.set_index('date')

Conditional column (np.where)

import numpy as np
import pandas as pd

# Build a single-column frame holding 100 random integers drawn from [0, 100).
random_values = np.random.randint(0, 100, size=(100, 1))
df = pd.DataFrame(random_values, columns=['col1'])

# Label each value by parity: a remainder of 1 after dividing by 2 means odd.
is_odd = df['col1'] % 2 == 1
df['odd/even'] = np.where(is_odd, 'odd', 'even')

Applying function to column (apply)

import numpy as np
import pandas as pd

# Single-column frame of 100 random integers drawn from [0, 100).
df = pd.DataFrame(np.random.randint(0, 100, size=(100, 1)), columns=['col1'])


def multiply_by_2(number):
    """Return ``number`` multiplied by two."""
    return number * 2


# Series.apply calls the function once for every element of col1.
df['col2'] = df['col1'].apply(multiply_by_2)

matplotlib

Basic line chart

import matplotlib.pyplot as plt

# Draw on the current Axes explicitly instead of through the pyplot shortcuts.
ax = plt.gca()
ax.plot(df.index, df['value'])
ax.set_title('Value over time')
ax.set_xlabel('date')
ax.set_ylabel('value')
plt.show()

Save figure

# One figure holding a single Axes.
fig, ax = plt.subplots()

x_values = [1, 2, 3]
y_values = [4, 5, 6]
ax.plot(x_values, y_values)

# Write the rendered figure to disk; the format follows the file extension.
fig.savefig('plot.png')

scikit-learn

Train test split

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

Simple pipeline

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Standardise the features, then fit a logistic-regression classifier on them.
steps = (StandardScaler(), LogisticRegression())
pipe = make_pipeline(*steps)
pipe.fit(X_train, y_train)

stats and experimentation

Two-proportion z-test (placeholder)

# Placeholder: use statsmodels or scipy for two-proportion z-test
from statsmodels.stats.proportion import proportions_ztest

# Successes and sample sizes, listed in the same (A, B) order.
count = [success_a, success_b]
nobs = [n_a, n_b]

# Returns the test statistic and its p-value.
stat, pval = proportions_ztest(count=count, nobs=nobs)

Sample ratio mismatch note

# Sample ratio mismatch: compare the share of units that landed in arm A
# with the share the experiment was designed to send there.
expected_ratio = 0.5
total = n_a + n_b
observed = n_a / total
print(observed, expected_ratio)

SQL

Group by month

-- Count events per calendar month, newest month first.
SELECT
    DATE_TRUNC('month', created_at) AS month,
    COUNT(*) AS events
FROM events
GROUP BY month
ORDER BY month DESC;

Window function with row_number

-- Number each user's events from newest (rn = 1) to oldest.
SELECT
    *,
    ROW_NUMBER() OVER (
        PARTITION BY user_id
        ORDER BY created_at DESC
    ) AS rn
FROM events;

Lag example

-- Attach the previous reading (by ts) for the same id; the first row of each
-- id has no predecessor, so prev_value is NULL there.
SELECT
    *,
    LAG(value) OVER (
        PARTITION BY id
        ORDER BY ts
    ) AS prev_value
FROM metrics;

Select first_name from customer

-- Return the first_name column for every row in customer.
SELECT
    first_name
FROM customer;

Haskell

Simple function

-- | Sum of two 'Int's, written point-free.
add :: Int -> Int -> Int
add = (+)

Pattern matching

-- | Factorial of a non-negative 'Int'.
--
-- @fact 0@ is 1. A negative argument raises an error instead of recursing
-- without ever reaching the base case.
fact :: Int -> Int
fact n
  | n < 0     = error "fact: negative argument"
  | otherwise = product [1 .. n]  -- the empty range for n == 0 gives 1

Main example (let/do)

-- | Print a greeting, then a number, each on its own line.
main :: IO ()
main = do
  let greeting = "Hi"
      count = 3
  putStrLn greeting
  -- print is putStrLn . show
  print count

Scala

Case class

/** A user record.
  *
  * @param id   integer identifier of the user
  * @param name the user's name
  */
case class User(id: Int, name: String)

Map and filter example

val nums = List(1, 2, 3, 4, 5)
// Keep the even numbers and double them in a single pass.
val evens = nums.collect { case n if n % 2 == 0 => n * 2 }

File exists

// True only when file_path refers to an existing regular file; a directory
// or a missing path yields false.
new java.io.File(file_path).isFile

Split list given separator

import scala.collection.mutable.ListBuffer  
/** Splits `collection` into runs of elements delimited by `seperator`.
  *
  * The separator itself is dropped. Every occurrence starts a new group, so
  * leading, trailing or adjacent separators produce empty groups, and an
  * empty input yields a single empty group.
  *
  * @param collection elements to split
  * @param seperator  value that marks a group boundary (compared with `==`)
  * @return the groups, in their original order
  */
def listSplit[T](collection: Seq[T], seperator: T): Seq[Seq[T]] =
  collection.foldLeft(Vector(Vector.empty[T])) { (groups, element) =>
    if (element == seperator) {
      // Boundary reached: open a fresh, empty group.
      groups :+ Vector.empty[T]
    } else {
      // Append to the group that is currently open (always the last one).
      groups.updated(groups.length - 1, groups.last :+ element)
    }
  }