Source code for dwopt.testing

import datetime
import random

import pandas as pd
import sqlalchemy as alc

import dwopt
from dwopt.set_up import _TEST_LT_URL, _TEST_OC_URL, _TEST_PG_URL


def _make_test_df(n=10000):
    random.seed(0)
    df = pd.DataFrame(
        {
            "id": range(n),
            "score": [random.uniform(-1, 5) for i in range(n)],
            "amt": random.choices(range(1000), k=n),
            "cat": random.choices(["test", "train"], k=n),
            "dte": [
                datetime.date.fromisoformat(i)
                for i in random.choices(["2022-01-01", "2022-02-02", "2022-03-03"], k=n)
            ],
            "time": [
                datetime.datetime.fromisoformat(i)
                for i in random.choices(
                    [
                        "2022-01-01 00:19:02.011135",
                        "2022-02-02 23:00:00.000000",
                        "2022-03-03 10:19:35.071235",
                    ],
                    k=n,
                )
            ],
        }
    )

    for col in ["score", "cat", "dte", "time"]:
        df.loc[random.choices(range(n), k=int(n / 20)), col] = None

    return df


def _parse_sch_tbl_nme(sch_tbl_nme):
    db = dwopt.dbo._Db
    return db._parse_sch_tbl_nme(db, sch_tbl_nme)


def _make_pg_tbl(df, eng, sch_tbl_nme):
    _, sch, tbl_nme = _parse_sch_tbl_nme(sch_tbl_nme)
    meta = alc.MetaData()
    test_tbl = alc.Table(
        tbl_nme,
        meta,
        alc.Column("id", alc.dialects.postgresql.BIGINT, primary_key=True),
        alc.Column("score", alc.Float(8)),
        alc.Column("amt", alc.dialects.postgresql.BIGINT),
        alc.Column("cat", alc.String(20)),
        alc.Column("dte", alc.Date),
        alc.Column("time", alc.DateTime),
        schema=sch,
    )
    with eng.begin() as conn:
        test_tbl.drop(conn, checkfirst=True)
    meta.create_all(eng)
    with eng.begin() as conn:
        conn.execute(
            test_tbl.insert(),
            df.assign(
                time=lambda x: x.time.astype(object).where(~x.time.isna(), None)
            ).to_dict("records"),
        )


def _make_lt_tbl(df, eng, sch_tbl_nme):
    _, sch, tbl_nme = _parse_sch_tbl_nme(sch_tbl_nme)
    meta = alc.MetaData()
    test_tbl = alc.Table(
        tbl_nme,
        meta,
        alc.Column("id", alc.Integer, primary_key=True),
        alc.Column("score", alc.REAL),
        alc.Column("amt", alc.Integer),
        alc.Column("cat", alc.String),
        alc.Column("dte", alc.String),
        alc.Column("time", alc.String),
        schema=sch,
    )
    with eng.begin() as conn:
        test_tbl.drop(conn, checkfirst=True)
    meta.create_all(eng)
    with eng.begin() as conn:
        conn.execute(
            test_tbl.insert(),
            df.assign(
                time=lambda x: x.time.astype(str).where(~x.time.isna(), None)
            ).to_dict("records"),
        )


def _make_oc_tbl(df, eng, sch_tbl_nme):
    _, sch, tbl_nme = _parse_sch_tbl_nme(sch_tbl_nme)
    meta = alc.MetaData()
    test_tbl = alc.Table(
        tbl_nme,
        meta,
        alc.Column("id", alc.dialects.oracle.NUMBER, primary_key=True),
        alc.Column("score", alc.Float),
        alc.Column("amt", alc.dialects.oracle.NUMBER),
        alc.Column("cat", alc.String(20)),
        alc.Column("dte", alc.Date),
        alc.Column("time", alc.Date),
        schema=sch,
    )
    with eng.begin() as conn:
        test_tbl.drop(conn, checkfirst=True)
    meta.create_all(eng)
    with eng.begin() as conn:
        conn.execute(
            test_tbl.insert(),
            df.assign(
                score=lambda x: x.score.astype(object).where(~x.score.isna(), None),
                time=lambda x: x.time.astype(object).where(~x.time.isna(), None),
            ).to_dict("records"),
        )


[docs]def make_test_tbl(db, sch_tbl_nme="test", n=10000): """Make or remake a test table on database. Uses Sqlalchemy toolkits for table drop, creation, insertion. Parameters ------------ db: dwopt.dbo._Db, or str Dwopt database operator object. Or one of ``'pg'``, ``'lt'``, and ``'oc'``, indicating usage of pre-defined testing database engines. sch_tbl_nme: str Table name in form ``my_schema1.my_table1`` or ``my_table1``. n: int Number of records. Returns ---------- (dwopt.dbo._Db, pandas.DataFrame): Tuple of database operator used, and the test dataframe. Notes ------ **Table specifications** ====== ============== ============== ==== =================== Column Column type Object type None Example ====== ============== ============== ==== =================== id int64 int64 0 score float64 float64 NaN 4.066531 amt int64 int64 867 cat object str None train dte object datetime.date None 2022-03-03 time datetime64[ns] datetime64[ns] NaT 2022-02-02 23:00:00 ====== ============== ============== ==== =================== **Test database table specifications** ====== =========== ======= ============ Column Postgre Sqlite Oracle ====== =========== ======= ============ id bigint integer number score float8 real float amt bigint integer number cat varchar(20) text varchar2(20) dte date text date time timestamp text date ====== =========== ======= ============ These datatypes are implemented via closest `Sqlalchemy datatypes <https://docs.sqlalchemy.org/en/14/core/type_basics.html>`_. The ``id`` column is made primary key in the test database tables. *Floating point types* The ``score`` column's ``NaN`` objects are converted into ``None`` before insertion for oracle. *Datetime types* The ``time`` column's ``NaT`` objects are converted into ``None`` before insertion for Postgre and Oracle. The ``time`` column are converted into str and None before insertion for Sqlite. **Pre-defined testing database engines** * ``pg``: ``postgresql://dwopt_tester:1234@localhost/dwopt_test`` * ``lt``: ``sqlite://`` * ``oc``: ``oracle://dwopt_test:1234@localhost:1521/?service_name=XEPDB1 &encoding=UTF-8&nencoding=UTF-8``. **Install testing databases** *Postgre* * Install from `postgresql windows <https://www.postgresql.org/download/windows/>`_. * Add postgres bin directory to path. * Run commands: .. code-block:: console psql -U postgres CREATE DATABASE dwopt_test; CREATE USER dwopt_tester WITH PASSWORD '1234'; GRANT ALL PRIVILEGES ON DATABASE dwopt_test to dwopt_tester; *Postgre on linux* .. code-block:: console sudo su postgres psql CREATE DATABASE dwopt_test; CREATE USER dwopt_tester WITH PASSWORD '1234'; GRANT ALL PRIVILEGES ON DATABASE dwopt_test to dwopt_tester; *Oracle* * Install from `oracle xe <https://www.oracle.com/database/technologies/xe-downloads.html>`_. * Set ORACLE_HOME, ORACLE_SID environment variable, and path based on the `oracle configuration <https://docs.oracle.com/database/121/ADMQS/ GUID-EC18C4A6-3BA5-4C14-9D76-B0DD62FEFFF2.htm#ADMQS12369>`_. * Run commands: .. code-block:: console sqlplus sys/{PASSWORD}@//localhost:1521/XEPDB1 as sysdba create user dwopt_test identified by 1234; grant create session to dwopt_test; grant create table to dwopt_test; grant unlimited tablespace to dwopt_test; Examples ---------- Make test table via provided database operator: >>> from dwopt import lt, make_test_tbl >>> _ = make_test_tbl(lt) >>> lt.qry('test').len() 10000 Make database operator for the pre-defined test databases, then make test table on it: >>> from dwopt import make_test_tbl >>> lt, df = make_test_tbl('lt', 'foo', 999) >>> lt.eng Engine(sqlite://) >>> lt.qry('foo').len() 999 >>> len(df) 999 """ if isinstance(db, str): if db == "pg": db = dwopt.db(_TEST_PG_URL) elif db == "lt": db = dwopt.db(_TEST_LT_URL) elif db == "oc": db = dwopt.db(_TEST_OC_URL) else: raise ValueError("Invalid db str, use one of 'pg', 'lt', or 'oc'") df = _make_test_df(n) dlc = db._dialect if dlc == "pg": _make_pg_tbl(df, db.eng, sch_tbl_nme) elif dlc == "lt": _make_lt_tbl(df, db.eng, sch_tbl_nme) elif dlc == "oc": _make_oc_tbl(df, db.eng, sch_tbl_nme) else: raise ValueError( "Invalid db, must be a database operator object, instances of " "(dwopt.Pg, dwopt.Lt, dwopt.Oc)" ) return db, df