Đây là một data.table
giải pháp sử dụng foverlaps
để phát hiện các bản ghi chồng chéo (như đã được đề cập bởi @GenesRus). Các bản ghi chồng lấp được gán cho các nhóm để lọc bản ghi với tối đa. ưu tiên trong nhóm. Tôi đã thêm hai bản ghi vào dữ liệu mẫu của bạn, để cho thấy rằng quy trình này cũng đang hoạt động cho ba hoặc nhiều bản ghi chồng chéo:
Chỉnh sửa: Tôi đã sửa đổi và dịch giải pháp @ pgcudahy để data.table
cung cấp mã nhanh hơn:
library(data.table)
library(lubridate)
times_df <- data.frame(
start = as_datetime(
c(
"2019-10-05 14:05:25",
"2019-10-05 17:30:20",
"2019-10-05 17:37:00",
"2019-10-06 04:43:55",
"2019-10-06 04:53:45",
"2019-10-06 04:53:46",
"2019-10-06 04:53:47"
)
),
stop = as_datetime(
c(
"2019-10-05 14:19:20",
"2019-10-05 17:45:15",
"2019-10-05 17:50:45",
"2019-10-06 04:59:00",
"2019-10-06 05:07:10",
"2019-10-06 05:07:11",
"2019-10-06 05:07:12"
)
),
priority = c(5, 3, 4, 3, 4, 5, 6)
)
resultDT <- setDT(times_df, key="start")[!(stop >= shift(start, type="lead", fill = TRUE) & priority > shift(priority, type="lead", fill = TRUE)) &
!(start <= shift(stop, type="lag", fill = FALSE) & priority > shift(priority, type="lag", fill = TRUE))]
# old approach ------------------------------------------------------------
# times_dt <- as.data.table(times_df)
# setkey(times_dt, start, stop)[, index := .I]
# overlaps_dt <- foverlaps(times_dt, times_dt, type = "any", which = TRUE)[xid != yid][, group := fifelse(xid > yid, yes = paste0(yid, "_", xid), no = paste0(xid, "_", yid))]
# overlaps_merged <- merge(times_dt, overlaps_dt, by.x = "index", by.y = "xid")[, .(delete_index = index[priority == max(priority)]), by = "group"]
# result_dt <- times_dt[!unique(overlaps_merged$delete_index)][, index := NULL]
Để biết thêm chi tiết, vui lòng xem ?foverlaps
- Có một số tính năng hữu ích hơn được triển khai để kiểm soát những gì được coi là trùng lặp maxgap
, minoverlap
hoặc type
(bất kỳ, trong, bắt đầu, kết thúc và bằng nhau).
Cập nhật - điểm chuẩn mới
Unit: microseconds
expr min lq mean median uq max neval
Paul 25572.550 26105.2710 30183.930 26514.342 29614.272 153810.600 100
MKa 5100.447 5276.8350 6508.333 5401.275 5832.270 23137.879 100
pgcudahy 3330.243 3474.4345 4284.640 3556.802 3748.203 21241.260 100
ismirsehregal 711.084 913.3475 1144.829 1013.096 1433.427 2316.159 100
Mã điểm chuẩn:
#### library ----
library(dplyr)
library(lubridate)
library(igraph)
library(data.table)
library(microbenchmark)
#### data ----
times_df <- data.frame(
start = as_datetime(
c(
"2019-10-05 14:05:25",
"2019-10-05 17:30:20",
"2019-10-05 17:37:00",
"2019-10-06 04:43:55",
"2019-10-06 04:53:45",
"2019-10-06 04:53:46",
"2019-10-06 04:53:47"
)
),
stop = as_datetime(
c(
"2019-10-05 14:19:20",
"2019-10-05 17:45:15",
"2019-10-05 17:50:45",
"2019-10-06 04:59:00",
"2019-10-06 05:07:10",
"2019-10-06 05:07:11",
"2019-10-06 05:07:12"
)
),
priority = c(5, 3, 4, 3, 4, 5, 6)
)
times_tib <- as_tibble(times_df)
times_dt <- as.data.table(times_df)
#### group_interval function ----
# buffer to take a form similar to: days(1), weeks(2), etc.
group_interval <- function(start, end, buffer = 0) {
dat <- tibble(rid = 1:length(start),
start = start,
end = end,
intervals = case_when(!is.na(start) & !is.na(end) ~ interval(start, end),
is.na(start) ~ interval(end, end),
is.na(end) ~ interval(start, start),
TRUE ~ interval(NA, NA)))
# apply buffer period to intervals
int_start(dat$intervals) <- int_start(dat$intervals) - buffer + seconds(0.01)
int_end(dat$intervals) <- int_end(dat$intervals) + buffer - seconds(0.01)
df_overlap <- bind_cols(
expand.grid(dat$rid, dat$rid), # make a 2 col table with every combination of id numbers
expand.grid(dat$intervals, dat$intervals)) %>% # make a combination of every interval
mutate(overlap = int_overlaps(.data$Var11, .data$Var21)) %>% # determine if intervals overlap
rename("row" = "Var1", "col" = "Var2")
# Find groups via graph theory See igraph package
dat_graph <- graph_from_data_frame(filter(df_overlap, overlap) %>% select(row, col))
groups <- components(dat_graph)$membership[df_overlap$row]
# create a 2 column df with row (index) and group number, arrange on row number and return distinct values
df_groups <- tibble(row = as.integer(names(groups)), group = groups) %>%
unique()
# returns
left_join(select(dat, rid), df_groups, by = c("rid" = "row"))$group
}
#### benchmark ----
library(igraph)
library(data.table)
library(dplyr)
library(lubridate)
library(microbenchmark)
df_Paul <- df_MKa <- df_pgcudahy <- df_ismirsehregal <- times_df <- data.frame(
start = as_datetime(
c(
"2019-10-05 14:05:25",
"2019-10-05 17:30:20",
"2019-10-05 17:37:00",
"2019-10-06 04:43:55",
"2019-10-06 04:53:45",
"2019-10-06 04:53:46",
"2019-10-07 06:00:00",
"2019-10-07 06:10:00",
"2019-10-07 06:20:00",
"2019-10-08 06:00:00",
"2019-10-08 06:10:00",
"2019-10-08 06:20:00",
"2019-10-09 03:00:00",
"2019-10-09 03:10:00",
"2019-10-10 03:00:00",
"2019-10-10 03:10:00",
"2019-10-11 05:00:00",
"2019-10-11 05:00:00")
),
stop = as_datetime(
c(
"2019-10-05 14:19:20",
"2019-10-05 17:45:15",
"2019-10-05 17:50:45",
"2019-10-06 04:59:00",
"2019-10-06 05:07:10",
"2019-10-06 05:07:11",
"2019-10-07 06:18:00",
"2019-10-07 06:28:00",
"2019-10-07 06:38:00",
"2019-10-08 06:18:00",
"2019-10-08 06:28:00",
"2019-10-08 06:38:00",
"2019-10-09 03:30:00",
"2019-10-09 03:20:00",
"2019-10-10 03:30:00",
"2019-10-10 03:20:00",
"2019-10-11 05:40:00",
"2019-10-11 05:40:00")
),
priority = c(5, 3, 4, 3, 4, 5, 4, 3, 4, 3, 4, 3, 1, 2, 2, 1, 3, 4)
)
benchmarks <- microbenchmark(Paul = {
group_interval <- function(start, end, buffer = 0) {
dat <- tibble(rid = 1:length(start),
start = start,
end = end,
intervals = case_when(!is.na(start) & !is.na(end) ~ interval(start, end),
is.na(start) ~ interval(end, end),
is.na(end) ~ interval(start, start),
TRUE ~ interval(NA, NA)))
int_start(dat$intervals) <- int_start(dat$intervals) - buffer + seconds(0.01)
int_end(dat$intervals) <- int_end(dat$intervals) + buffer - seconds(0.01)
df_overlap <- bind_cols(
expand.grid(dat$rid, dat$rid), # make a 2 col table with every combination of id numbers
expand.grid(dat$intervals, dat$intervals)) %>% # make a combination of every interval
mutate(overlap = int_overlaps(.data$Var11, .data$Var21)) %>% # determine if intervals overlap
rename("row" = "Var1", "col" = "Var2")
dat_graph <- graph_from_data_frame(filter(df_overlap, overlap) %>% select(row, col))
groups <- components(dat_graph)$membership[df_overlap$row]
df_groups <- tibble(row = as.integer(names(groups)), group = groups) %>%
unique()
left_join(select(dat, rid), df_groups, by = c("rid" = "row"))$group
}
times_tib <- as_tibble(df_Paul)
mutate(times_tib, group = group_interval(start, stop)) %>%
group_by(group) %>%
top_n(1, desc(priority)) %>%
ungroup() %>%
select(-group)
},
MKa = {
df_MKa$id <- 1:nrow(df_MKa)
# Create consolidated df which we will use to check if stop date is in between start and stop
my_df <- bind_rows(replicate(n = nrow(df_MKa), expr = df_MKa, simplify = FALSE))
my_df$stop_chk <- rep(df_MKa$stop, each = nrow(df_MKa))
# Flag if stop date sits in between start and stop
my_df$chk <- my_df$stop_chk >= my_df$start & my_df$stop_chk <= my_df$stop
my_df$chk_id <- df_MKa[match(my_df$stop_chk, df_MKa$stop), "id"]
# Using igrpah to cluster ids to create unique groups
# this will identify any overlapping groups
library(igraph)
g <- graph.data.frame(my_df[my_df$chk == TRUE, c("id", "chk_id")])
df_g <- data.frame(clusters(g)$membership)
df_g$chk_id <- row.names(df_g)
# copy the unique groups to the df
my_df$new_id <- df_g[match(my_df$chk_id, df_g$chk_id), "clusters.g..membership"]
my_df %>%
filter(chk == TRUE) %>%
arrange(priority) %>%
filter(!duplicated(new_id)) %>%
select(start, stop, priority) %>%
arrange(start)
}, pgcudahy = {
df_pgcudahy %>%
arrange(start) %>%
mutate(remove1 = ifelse((stop >= lead(start, default=FALSE)) &
(priority > lead(priority, default=(max(priority) + 1))), TRUE, FALSE)) %>%
mutate(remove2 = ifelse((start <= lag(stop, default=FALSE)) &
(priority > lag(priority, default=(max(priority) + 1))), TRUE, FALSE)) %>%
filter(remove1 == FALSE & remove2 == FALSE) %>%
select(1:3)
}, ismirsehregal = {
setDT(df_ismirsehregal, key="start")[!(stop >= shift(start, type="lead", fill = TRUE) & priority > shift(priority, type="lead", fill = TRUE)) &
!(start <= shift(stop, type="lag", fill = FALSE) & priority > shift(priority, type="lag", fill = TRUE))]
})
benchmarks
combn
, mặc dù nó có thể tốn kém nếu bạn có nhiều hàng.times_df %>% mutate(interval = interval(start, stop)) %>% {combn(nrow(.), 2, function(x) if (int_overlaps(.$interval[x[1]], .$interval[x[2]])) x[which.min(.$priority[x])], simplify = FALSE)} %>% unlist() %>% {slice(times_df, -.)}