import random
import pandas as pd
import datetime
from dwopt import make_eng, Pg, Lt, Oc
from dwopt.set_up import _TEST_PG_URL, _TEST_LT_URL, _TEST_OC_URL
import sqlalchemy as alc
from sqlalchemy.dialects.oracle import NUMBER
from sqlalchemy.dialects.postgresql import BIGINT
import logging
_logger = logging.getLogger(__file__)
def make_test_df(n=10000):
"""Make a test dataframe with various data types and missing values.
Details see :func:`dwopt.make_test_tbl`.
Parameters
------------
n: int
Number of records.
Returns
--------
pandas.DataFrame
Examples
----------
>>> from dwopt.testing import make_test_df
>>> make_test_df(10000).iloc[0,:]
id 0
score 4.066531
amt 813
cat test
date 2022-01-01
time 2022-03-03 10:19:35.071235
Name: 0, dtype: object
"""
random.seed(0)
df = pd.DataFrame(
{
"id": range(n),
"score": [random.uniform(-1, 5) for i in range(n)],
"amt": random.choices(range(1000), k=n),
"cat": random.choices(["test", "train"], k=n),
"date": [
datetime.date.fromisoformat(i)
for i in random.choices(["2022-01-01", "2022-02-02", "2022-03-03"], k=n)
],
"time": [
datetime.datetime.fromisoformat(i)
for i in random.choices(
[
"2022-01-01 00:19:02.011135",
"2022-02-02 23:00:00.000000",
"2022-03-03 10:19:35.071235",
],
k=n,
)
],
}
)
for col in ["score", "cat", "date", "time"]:
df.loc[random.choices(range(n), k=int(n / 20)), col] = None
return df
def _parse_sch_tbl_nme(sch_tbl_nme):
return Pg._parse_sch_tbl_nme(Pg, sch_tbl_nme)
def _make_pg_tbl(df, eng, sch_tbl_nme):
_, sch, tbl_nme = _parse_sch_tbl_nme(sch_tbl_nme)
meta = alc.MetaData()
test_tbl = alc.Table(
tbl_nme,
meta,
alc.Column("id", BIGINT, primary_key=True),
alc.Column("score", alc.Float(8)),
alc.Column("amt", BIGINT),
alc.Column("cat", alc.String(20)),
alc.Column("date", alc.Date),
alc.Column("time", alc.DateTime),
schema=sch,
)
with eng.connect() as conn:
test_tbl.drop(conn, checkfirst=True)
meta.create_all(eng)
with eng.connect() as conn:
conn.execute(
test_tbl.insert(),
df.assign(
time=lambda x: x.time.astype(object).where(~x.time.isna(), None)
).to_dict("records"),
)
def _make_lt_tbl(df, eng, sch_tbl_nme):
_, sch, tbl_nme = _parse_sch_tbl_nme(sch_tbl_nme)
meta = alc.MetaData()
test_tbl = alc.Table(
tbl_nme,
meta,
alc.Column("id", alc.Integer, primary_key=True),
alc.Column("score", alc.REAL),
alc.Column("amt", alc.Integer),
alc.Column("cat", alc.String),
alc.Column("date", alc.String),
alc.Column("time", alc.String),
schema=sch,
)
with eng.connect() as conn:
test_tbl.drop(conn, checkfirst=True)
meta.create_all(eng)
with eng.connect() as conn:
conn.execute(
test_tbl.insert(),
df.assign(
time=lambda x: x.time.astype(str).where(~x.time.isna(), None)
).to_dict("records"),
)
def _make_oc_tbl(df, eng, sch_tbl_nme):
_, sch, tbl_nme = _parse_sch_tbl_nme(sch_tbl_nme)
meta = alc.MetaData()
test_tbl = alc.Table(
tbl_nme,
meta,
alc.Column("id", NUMBER, primary_key=True),
alc.Column("score", alc.Float),
alc.Column("amt", NUMBER),
alc.Column("cat", alc.String(20)),
alc.Column("date", alc.Date),
alc.Column("time", alc.DateTime),
schema=sch,
)
with eng.connect() as conn:
test_tbl.drop(conn, checkfirst=True)
meta.create_all(eng)
with eng.connect() as conn:
conn.execute(
test_tbl.insert(),
df.assign(
time=lambda x: x.time.astype(object).where(~x.time.isna(), None)
).to_dict("records"),
)
[docs]def make_test_tbl(db, sch_tbl_nme="test", n=10000):
"""Make or remake a test table on database.
Uses Sqlalchemy toolkits for table drop, creation, insertion.
Parameters
------------
db: dwopt.dbo._Db, or str
Dwopt database operator object. Or one of ``'pg'``, ``'lt'``, and ``'oc'``,
indicating usage of pre-defined testing database engines.
sch_tbl_nme: str
Table name in form ``my_schema1.my_table1`` or ``my_table1``.
n: int
Number of records.
Returns
----------
(dwopt.dbo._Db, pandas.DataFrame):
Tuple of database operator used, and the test dataframe.
Notes
------
**Table specifications**
====== ============== =========== ===================
Column Data type None values Example
====== ============== =========== ===================
id int64 0
score float64 NaN 4.066531
amt int64 867
cat str None train
date datetime.date None 2022-03-03
time datetime64[ns] NaT 2022-02-02 23:00:00
====== ============== =========== ===================
**Test database table specifications**
====== =========== ======= ============
Column Postgre Sqlite Oracle
====== =========== ======= ============
id bigint integer number
score float8 real float
amt bigint integer number
cat varchar(20) text varchar2(20)
date date text date
time timestamp text timestamp
====== =========== ======= ============
These datatypes are implemented via respective
`Sqlalchemy datatypes <https://docs.sqlalchemy.org/en/14/core/type_basics.html>`_.
The ``id`` column will be made primary key in the test database tables.
*Datetime types*
The ``time`` column's ``NaT`` objects are converted into None before insertion for
Postgre and Oracle.
The ``time`` column are converted into str and None before insertion for Sqlite.
See :meth:`dwopt.dbo._Db.write` for discussion on
datetime columns and reversibility of insert statements.
**Pre-defined testing database engines**
* ``pg``: ``postgresql://dwopt_tester:1234@localhost/dwopt_test``
* ``lt``: ``sqlite://``
* ``oc``: Not implemented.
**Install testing databases**
*Postgre*::
psql -U postgres
CREATE DATABASE dwopt_test;
CREATE USER dwopt_tester WITH PASSWORD '1234';
GRANT ALL PRIVILEGES ON DATABASE dwopt_test to dwopt_tester;
*Oracle*
* Install oracle db from the
`oracle xe <https://www.oracle.com/database/technologies/xe-downloads.html>`_.
* Schema: test_schema
* Not implemented.
Examples
----------
Make test table through user provided database operator:
>>> from dwopt import lt, make_test_tbl
>>> _ = make_test_tbl(lt)
>>> lt.qry('test').len()
10000
Use the function to make database operator linked to the pre-defined
test databases, then make test table on it:
>>> from dwopt import make_test_tbl
>>> lt, df = make_test_tbl('lt', 'foo', 999)
>>> lt.eng
Engine(sqlite://)
>>> lt.qry('foo').len()
999
>>> len(df)
999
"""
if isinstance(db, str):
if db == "pg":
db = Pg(make_eng(_TEST_PG_URL))
elif db == "lt":
db = Lt(make_eng(_TEST_LT_URL))
elif db == "pg":
db = Oc(make_eng(_TEST_OC_URL))
else:
raise ValueError("Invalid db str, use one of 'pg', 'lt', or 'oc'")
df = make_test_df(n)
if isinstance(db, Pg):
_make_pg_tbl(df, db.eng, sch_tbl_nme)
elif isinstance(db, Lt):
_make_lt_tbl(df, db.eng, sch_tbl_nme)
elif isinstance(db, Oc):
_make_oc_tbl(df, db.eng, sch_tbl_nme)
else:
raise ValueError(
"Invalid db, must be a database operator object, instances of "
"(dwopt.Pg, dwopt.Lt, dwopt.Oc)"
)
return db, df