TileDB Dimensions: Date and Datetime Support

Overview

TileDB supports a wide range of date and time types, as shown in the documentation for Datetimes.

The supported range of dates and times corresponds to the dates and times supported by the Python extension numpy, and is described in its documentation.

import numpy as np
import datetime

day = "1971-02-03 04:05:06"    # third day of second month of first year past epoch

[np.datetime64(day, 'Y').astype('int64'),    # extract year, month, ... as an int64
np.datetime64(day, 'M').astype('int64'),
np.datetime64(day, 'D').astype('int64'),
np.datetime64(day, 'h').astype('int64'),
np.datetime64(day, 'm').astype('int64'),
np.datetime64(day, 's').astype('int64')]
## [1, 13, 398, 9556, 573365, 34401906]

R has date and datetime support built in. The Date class represents dates as an integer count of days since the epoch, and matches the D value from the previous example. Similarly, the ‘compact’ POSIXct representation of a datetime uses the number of seconds since the epoch and corresponds to the s value from the previous example.

as.Date(398, origin="1970-01-01")  # conversion from numeric input requires 'origin'
## [1] "1971-02-03"
as.POSIXct(34401906, origin="1970-01-01", tz="UTC", usetz=TRUE)
## [1] "1971-02-03 04:05:06 UTC"

R can reconstruct dates and times from the numpy representation using the epoch as a ‘base’ date along with time period calculations. This can be done using base R (adding to Date or POSIXct objects), by using the lubridate package for a number of intermediate formats, and by using the nanotime package for higher-resolution periods and intervals.

suppressMessages(library(lubridate))
ymd("1970-01-01") +  c(years(1), months(13), days(398))
## [1] "1971-01-01" "1971-02-01" "1971-02-03"
ymd_hms("1970-01-01 00:00:00") + c(hours(9556), minutes(573365))
## [1] "1971-02-03 04:00:00 UTC" "1971-02-03 04:05:00 UTC"

suppressMessages(library(nanotime))
nanotime("1970-01-01T00:00:00+00:00") + nanoduration(hours=2, minutes=3,
                                                     seconds=4, nanoseconds=5)
## [1] 1970-01-01T02:03:04.000000005+00:00

Python and R Interoperability

Coarsest: Year

Python

import numpy as np
import sys
import os
import tiledb
uri = "/tmp/tiledb/dt_year"
dom = tiledb.Domain(tiledb.Dim(name="rows",
                               domain=(np.datetime64('2001-01-01'), np.datetime64('2030-12-31')),
                               tile=np.timedelta64(10, 'Y'),
                               dtype=np.datetime64('', 'Y')))
schema = tiledb.ArraySchema(domain=dom,
                            sparse=True,
                            attrs=[tiledb.Attr(name="a", dtype=np.int32)])
if (os.path.isdir(uri)):
    tiledb.VFS().remove_dir(uri)
tiledb.SparseArray.create(uri, schema)

with tiledb.SparseArray(uri, mode='w') as A:
    I = [np.datetime64('2001-01-01'),np.datetime64('2002-01-01'),np.datetime64('2003-01-01')]
    data = np.array(([1,2,3]))
    A[I] = data

R

library(tiledb)
uri <- "/tmp/tiledb/dt_year"
arr <- tiledb_array(uri, as.data.frame=TRUE)
arr[]
##         rows a
## 1 2001-01-01 1
## 2 2002-01-01 2
## 3 2003-01-01 3
## we can also look at 'raw' int64 values:
datetimes_as_int64(arr) <- TRUE
arr[]
##   rows a
## 1   31 1
## 2   32 2
## 3   33 3

Day

Python

import numpy as np
import sys
import os
import tiledb
uri = "/tmp/tiledb/dt_day"
dom = tiledb.Domain(tiledb.Dim(name="rows",
                               domain=(np.datetime64('2001-01-01'), np.datetime64('2030-12-31')),
                               tile=np.timedelta64(10, 'D'),
                               dtype=np.datetime64('', 'D')))
schema = tiledb.ArraySchema(domain=dom,
                            sparse=True,
                            attrs=[tiledb.Attr(name="a", dtype=np.int32)])
if (os.path.isdir(uri)):
    tiledb.VFS().remove_dir(uri)
tiledb.SparseArray.create(uri, schema)

with tiledb.SparseArray(uri, mode='w') as A:
    I = [np.datetime64('2001-01-01'),np.datetime64('2001-01-02'),np.datetime64('2001-01-03')]
    data = np.array(([1,2,3]))
    A[I] = data

R

library(tiledb)
uri <- "/tmp/tiledb/dt_day"
arr <- tiledb_array(uri, as.data.frame=TRUE)
arr[]
##         rows a
## 1 2001-01-01 1
## 2 2001-01-02 2
## 3 2001-01-03 3

Minute

Python

import numpy as np
import sys
import os
import tiledb
uri = "/tmp/tiledb/dt_min"
dom = tiledb.Domain(tiledb.Dim(name="rows",
                               domain=(np.datetime64('2001-01-01'), np.datetime64('2030-12-31')),
                               tile=np.timedelta64(10, 'm'),
                               dtype=np.datetime64('', 'm')))
schema = tiledb.ArraySchema(domain=dom,
                            sparse=True,
                            attrs=[tiledb.Attr(name="a", dtype=np.int32)])
if (os.path.isdir(uri)):
    tiledb.VFS().remove_dir(uri)
tiledb.SparseArray.create(uri, schema)

with tiledb.SparseArray(uri, mode='w') as A:
    I = [np.datetime64('2001-01-01 00:00'),
         np.datetime64('2001-01-02 00:01'),
         np.datetime64('2001-01-03 00:02')]
    data = np.array(([1,2,3]))
    A[I] = data

R

library(tiledb)
uri <- "/tmp/tiledb/dt_min"
arr <- tiledb_array(uri, as.data.frame=TRUE)
arr[]
##                  rows a
## 1 2001-01-01 00:00:00 1
## 2 2001-01-02 00:01:00 2
## 3 2001-01-03 00:02:00 3

Millisecond

Python

import numpy as np
import sys
import os
import tiledb
uri = "/tmp/tiledb/dt_ms"
dom = tiledb.Domain(tiledb.Dim(name="rows",
                               domain=(np.datetime64('1969-01-01'), np.datetime64('2030-12-31')),
                               tile=np.timedelta64(10, 'ms'),
                               dtype=np.datetime64('', 'ms')))
schema = tiledb.ArraySchema(domain=dom,
                            sparse=True,
                            attrs=[tiledb.Attr(name="a", dtype=np.int32)])
if (os.path.isdir(uri)):
    tiledb.VFS().remove_dir(uri)
tiledb.SparseArray.create(uri, schema)

with tiledb.SparseArray(uri, mode='w') as A:
    I = [np.datetime64('1970-01-01 00:00:00.001'),
         np.datetime64('1980-01-01 00:00:00.002'),
         np.datetime64('1990-01-01 00:00:00.003'),
         np.datetime64('2000-01-01 00:00:00.004'),
         np.datetime64('2010-01-01 00:00:00.005'),
         np.datetime64('2020-01-01 00:00:00.006')]
    data = np.array(([1,2,3,4,5,6]))
    A[I] = data

R

library(tiledb)
uri <- "/tmp/tiledb/dt_ms"
arr <- tiledb_array(uri, as.data.frame=TRUE)
arr[]
##                      rows a
## 1 1970-01-01 00:00:00.001 1
## 2 1980-01-01 00:00:00.002 2
## 3 1990-01-01 00:00:00.003 3
## 4 2000-01-01 00:00:00.004 4
## 5 2010-01-01 00:00:00.005 5
## 6 2020-01-01 00:00:00.006 6

Microsecond

Python

import numpy as np
import sys
import os
import tiledb
uri = "/tmp/tiledb/dt_us"
dom = tiledb.Domain(tiledb.Dim(name="rows",
                               domain=(np.datetime64('1969-01-01'), np.datetime64('2030-12-31')),
                               tile=np.timedelta64(10, 'us'),
                               dtype=np.datetime64('', 'us')))
schema = tiledb.ArraySchema(domain=dom,
                            sparse=True,
                            attrs=[tiledb.Attr(name="a", dtype=np.int32)])
if (os.path.isdir(uri)):
    tiledb.VFS().remove_dir(uri)
tiledb.SparseArray.create(uri, schema)

with tiledb.SparseArray(uri, mode='w') as A:
    I = [np.datetime64('1970-01-01 00:00:00.000001'),
         np.datetime64('1980-01-01 00:00:00.000002'),
         np.datetime64('1990-01-01 00:00:00.000003'),
         np.datetime64('2000-01-01 00:00:00.000004'),
         np.datetime64('2010-01-01 00:00:00.000005'),
         np.datetime64('2020-01-01 00:00:00.000006')]
    data = np.array(([1,2,3,4,5,6]))
    A[I] = data

R

library(tiledb)
uri <- "/tmp/tiledb/dt_us"
arr <- tiledb_array(uri, as.data.frame=TRUE)
arr[]
##                         rows a
## 1 1970-01-01 00:00:00.000001 1
## 2 1980-01-01 00:00:00.000001 2
## 3 1990-01-01 00:00:00.000002 3
## 4 2000-01-01 00:00:00.000003 4
## 5 2010-01-01 00:00:00.000005 5
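## 6 2020-01-01 00:00:00.000005 6

Note that some of the fractional seconds shown above are one microsecond lower than the values written from Python (for example, .000002 is displayed as .000001). This is presumably a display artifact: POSIXct stores seconds since the epoch as a double-precision number, which cannot represent every microsecond value exactly at these magnitudes, and R truncates rather than rounds fractional seconds when printing. The raw integer values can be inspected with datetimes_as_int64=TRUE, as in the "Use integer64 Directly" section below.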

Nanosecond

Python

import numpy as np
import sys
import os
import tiledb
uri = "/tmp/tiledb/dt_ns"
dom = tiledb.Domain(tiledb.Dim(name="rows",
                               domain=(np.datetime64('1969-01-01'), np.datetime64('2030-12-31')),
                               tile=np.timedelta64(10, 'ns'),
                               dtype=np.datetime64('', 'ns')))
schema = tiledb.ArraySchema(domain=dom,
                            sparse=True,
                            attrs=[tiledb.Attr(name="a", dtype=np.int32)])
if (os.path.isdir(uri)):
    tiledb.VFS().remove_dir(uri)
tiledb.SparseArray.create(uri, schema)

with tiledb.SparseArray(uri, mode='w') as A:
    I = [np.datetime64('1970-01-01 00:00:00.000000001'),
         np.datetime64('1980-01-01 00:00:00.000000002'),
         np.datetime64('1990-01-01 00:00:00.000000003'),
         np.datetime64('2000-01-01 00:00:00.000000004'),
         np.datetime64('2010-01-01 00:00:00.000000005'),
         np.datetime64('2020-01-01 00:00:00.000000006')]
    data = np.array(([1,2,3,4,5,6]))
    A[I] = data

R

library(tiledb)
uri <- "/tmp/tiledb/dt_ns"
arr <- tiledb_array(uri, as.data.frame=TRUE)
arr[]
##                                  rows a
## 1 1970-01-01T00:00:00.000000001+00:00 1
## 2 1980-01-01T00:00:00.000000002+00:00 2
## 3 1990-01-01T00:00:00.000000003+00:00 3
## 4 2000-01-01T00:00:00.000000004+00:00 4
## 5 2010-01-01T00:00:00.000000005+00:00 5
## 6 2020-01-01T00:00:00.000000006+00:00 6

Use integer64 Directly

Sometimes we may want to access date or datetime values in their native integer64 format. To do so, we set a toggle when opening the array, as shown in the following example, which uses the array from the preceding example (at nanosecond resolution).

library(tiledb)
uri <- "/tmp/tiledb/dt_ns"
arr <- tiledb_array(uri, as.data.frame=TRUE, datetimes_as_int64=TRUE)
arr[]
##                  rows a
## 1                   1 1
## 2  315532800000000002 2
## 3  631152000000000003 3
## 4  946684800000000004 4
## 5 1262304000000000005 5
## 6 1577836800000000006 6

We can also write integer64 values. The following example adds two extra rows:

library(tiledb)
uri <- "/tmp/tiledb/dt_ns"
arr <- tiledb_array(uri, as.data.frame=TRUE, datetimes_as_int64=TRUE)
arr[] <- data.frame( rows=bit64::as.integer64(2:3), a=102:103)
arr[]
##                  rows   a
## 1                   1   1
## 2                   2 102
## 3                   3 103
## 4  315532800000000002   2
## 5  631152000000000003   3
## 6  946684800000000004   4
## 7 1262304000000000005   5
## 8 1577836800000000006   6