diff --git a/R/data.table.R b/R/data.table.R
index 9eb294c41..02ea2a694 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -517,13 +517,22 @@ replace_dot_alias = function(e) {
         if (!byjoin || nqbyjoin) {
           # Really, `anyDuplicated` in base is AWESOME!
           # allow.cartesian shouldn't error if a) not-join, b) 'i' has no duplicates
-          if (verbose) {last.started.at=proc.time();catf("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()}
-          irows = if (allLen1) f__ else vecseq(f__,len__,
-            if (allow.cartesian ||
-                notjoin || # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
-                !anyDuplicated(f__, incomparables = c(0L, NA_integer_))) {
-              NULL # #742. If 'i' has no duplicates, ignore
-            } else as.double(nrow(x)+nrow(i))) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)).
+          if (verbose) {last.started.at=proc.time();catf("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()}
+          irows = if (allLen1) f__ else {
+            join.many = getOption("datatable.join.many") # #914, default TRUE for backward compatibility
+            ## anyDups stays NULL when no duplicate check is required, otherwise TRUE/FALSE: does f__ hold duplicated starts (0L and NA are not compared)
+            anyDups = if (!join.many && length(f__)==1L && len__==nrow(x)) {
+              NULL # special case of scalar i match to const duplicated x, not handled by anyDuplicated: data.table(x=c(1L,1L))[data.table(x=1L), on="x"]
+            } else if (!notjoin && (!allow.cartesian || !join.many)) # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
+              as.logical(anyDuplicated(f__, incomparables = c(0L, NA_integer_)))
+            ## limit stays NULL unless duplicates were found; it is handed to vecseq, which presumably raises an error once the result would exceed it (confirm in vecseq)
+            limit = if (!is.null(anyDups) && anyDups) { # #742. If 'i' has no duplicates, ignore
+              if (!join.many) stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.")
+              else if (!allow.cartesian && !notjoin) as.double(nrow(x)+nrow(i))
+              else internal_error("checking allow.cartesian and join.many, unexpected else branch reached") # nocov
+            }
+            vecseq(f__, len__, limit)
+          } # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)).
           if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
           # Fix for #1092 and #1074
           # TODO: implement better version of "any"/"all"/"which" to avoid
diff --git a/R/mergelist.R b/R/mergelist.R
index 9606ce0ab..f33cdcf99 100644
--- a/R/mergelist.R
+++ b/R/mergelist.R
@@ -7,3 +7,123 @@ cbindlist = function(l, copy=TRUE) {
   setDT(ans)
   ans
 }
+
+# when 'on' is missing the join columns are derived from the keys of both sides, used only for inner and full join
+# a single available key is returned as is, two keys give their common columns ordered as in the shorter one
+onkeys = function(x, y) {
+  if (is.null(x) || is.null(y)) {
+    return(if (is.null(x)) y else x) ## NULL only when both keys are NULL, internal error is being called later in mergepair
+  }
+  if (length(x) < length(y)) intersect(x, y) ## align order to shorter key
+  else intersect(y, x) ## equal lengths follow the rhs key
+}
+# someCols resolves a column selection on 'x': 'cols' without 'drop', always including 'keep', with 'keep' placed first
+someCols = function(x, cols, drop=character(), keep=character(), retain.order=FALSE) {
+  ikeep = colnamesInt(x, keep) ## arguments are resolved in this order: keep, drop, cols
+  idrop = colnamesInt(x, drop)
+  icols = colnamesInt(x, cols)
+  picked = union(ikeep, setdiff(icols, idrop))
+  if (retain.order) intersect(colnamesInt(x, NULL), picked) else picked ## retain.order=TRUE makes the selection follow colnamesInt(x, NULL), presumably all columns of x in their order
+}
+# hasindex tells whether 'x' carries an index on 'by' columns, looked up as attribute "__col1__col2" of attr(x, "index")
+hasindex = function(x, by, retGrp=FALSE) {
+  idx = attr(x, "index", exact=TRUE)
+  if (!is.null(idx)) ## narrow down from the index container to the entry for 'by'
+    idx = attr(idx, paste0("__", by, collapse=""), exact=TRUE)
+  if (is.null(idx)) return(FALSE)
+  ## with retGrp=TRUE the index counts only when it also stores the "starts" attribute
+  if (!retGrp) TRUE else !is.null(attr(idx, "starts", exact=TRUE))
+}
+
+# fdistinct keeps a single row per group of the 'on' columns, the first or the last one as requested by 'mult'
+# for mult='first' it is unique(x, by=on)[, c(on, cols), with=FALSE]
+# when x is already unique by 'on' no subsetting happens and, with copy=FALSE, the result may not be a copy
+fdistinct = function(x, on=key(x), mult=c("first","last"), cols=seq_along(x), copy=TRUE) {
+  if (!perhaps.data.table(x)) stopf("'x' must be data.table")
+  if (!(is.character(on) && length(on) && !anyNA(on) && all(on %chin% names(x))))
+    stopf("'on' must be character column names of 'x' argument")
+  mult = match.arg(mult)
+  if (is.null(cols)) cols = seq_along(x)
+  else if (!(is.character(cols) || is.integer(cols)) || !length(cols) || anyNA(cols))
+    stopf("'cols' must be non-zero length, non-NA, integer or character columns of 'x' argument")
+  if (!isTRUEorFALSE(copy)) stopf("'%s' must be TRUE or FALSE", "copy")
+  ## sort=TRUE is strictly needed only for mult="last"; mult="first" could use sort=FALSE unless a sorted index is already available
+  ## that short circuit requires retGrp=TRUE, so it waits for #4386; until then the line below stays disabled (test 302.02)
+  #### sort = mult!="first" || hasindex(x, by=on, retGrp=TRUE)
+  sort = TRUE
+  ord = forderv(x, by=on, sort=sort, retGrp=TRUE)
+  if (attr(ord, "maxgrpn", TRUE) <= 1L) { ## no group has more than one row, nothing to drop
+    ans = .shallow(x, someCols(x, cols, keep=on), retain.key=TRUE)
+    return(if (copy) copy(ans) else ans)
+  }
+  rows = attr(ord, "starts", exact=TRUE) ## start of each group, as positions within the ordering
+  if (mult=="last") {
+    if (!sort) internal_error("sort must be TRUE when computing mult='last'") # nocov
+    rows = c(rows[-1L]-1L, nrow(x)) ## a group ends right before the next one starts, the last group ends at nrow(x)
+  }
+  if (length(ord)) rows = ord[rows] ## translate to row numbers of x; presumably zero-length ord means x is already ordered
+  if (sort) { ## this rolls back to original order
+    back = forderv(rows)
+    if (length(back)) rows = rows[back]
+  }
+  ## subset the chosen rows, taking 'on' plus 'cols' columns
+  .Call(CsubsetDT, x, rows, someCols(x, cols, keep=on))
+}
+
+# dtmerge: extra layer over bmerge(i, x, ...) to provide ready to use row indices; returns list(ans, irows, xrows), list(ans, irows) for semi and anti joins, invisible NULL when void=TRUE
+# irows can be NULL in place of 1:nrow(i) to avoid extra copies in downstream code, it turned out that avoiding copies precisely is costly and enormously complicates code, need #4409 and/or handle 1:nrow in subsetDT
+dtmerge = function(x, i, on, how, mult, join.many, void=FALSE, verbose) {
+  nomatch = switch(how, "inner"=, "semi"=, "anti"=, "cross"= 0L, "left"=, "right"=, "full"= NA_integer_) ## forwarded to bmerge below
+  nomatch0 = identical(nomatch, 0L) ## TRUE for inner, semi, anti and cross joins
+  if (is.null(mult)) ## caller did not choose mult, use the default for this kind of join
+    mult = switch(how, "semi"=, "anti"= "last", "cross"= "all", "inner"=, "left"=, "right"=, "full"= "error")
+  if (void && mult!="error")
+    internal_error("void must be used with mult='error'") # nocov
+  if (how=="cross") { ## cross join does not call bmerge, a list of the same shape is built by hand
+    if (length(on) || mult!="all" || !join.many)
+      stopf("cross join must be used with zero-length on, mult='all', join.many=TRUE")
+    if (void)
+      internal_error("cross join must be used with void=FALSE") # nocov
+    ans = list(allLen1=FALSE, starts=rep.int(1L, nrow(i)), lens=rep.int(nrow(x), nrow(i)), xo=integer()) ## every row of i starts at row 1 of x and spans all nrow(x) rows
+  } else {
+    if (!length(on))
+      stopf("'on' must be non-zero length character vector")
+    if (mult=="all" && (how=="semi" || how=="anti"))
+      stopf("semi and anti joins must be used with mult!='all'")
+    icols = colnamesInt(i, on, check_dups=TRUE) ## 'on' resolved against i; check_dups=TRUE presumably rejects duplicated entries, confirm in colnamesInt
+    xcols = colnamesInt(x, on, check_dups=TRUE) ## the same 'on' resolved against x
+    ans = bmerge(i, x, icols, xcols, roll=0, rollends=c(FALSE, TRUE), nomatch=nomatch, mult=mult, ops=rep.int(1L, length(on)), verbose=verbose) ## note the argument order: i first, then x; ops are all 1L, presumably equality on every 'on' column, confirm in bmerge
+    if (void) { ## void=TRUE is only for the case when we want to raise an error for mult='error', and that would happen in the above line
+      return(invisible(NULL))
+    } else if (how=="semi" || how=="anti") { ## semi and anti short-circuit
+      irows = which(if (how=="semi") ans$lens!=0L else ans$lens==0L) ## we will subset i rather than x, thus assign to irows, not to xrows; semi takes non-zero lens, anti takes zero lens
+      if (length(irows)==length(ans$lens)) irows = NULL ## all rows of i selected, NULL stands for all of them
+      return(list(ans=ans, irows=irows))
+    } else if (mult=="all" && !ans$allLen1 && !join.many && ## join.many, like allow.cartesian, check
+      !(length(ans$starts)==1L && ans$lens==nrow(x)) && ## special case of scalar i match to const duplicated x, not handled by anyDuplicated: data.table(x=c(1L,1L))[data.table(x=1L), on="x"]
+      anyDuplicated(ans$starts, incomparables=c(0L,NA_integer_)) ## the same start reached from more than one row of i; 0L and NA starts are not compared
+    )
+      stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.")
+  }
+
+  ## xrows, join-to
+  xrows = if (ans$allLen1) ans$starts else vecseq(ans$starts, ans$lens, NULL) ## with allLen1 starts are used directly, otherwise vecseq presumably expands each start by its len (NULL: no limit), confirm in vecseq
+  if (nomatch0 && ans$allLen1) xrows = xrows[as.logical(ans$lens)] ## drop entries whose len is 0
+  len.x = length(xrows) ## as of now cannot optimize to NULL, search for #4409 here
+
+  ## irows, join-from
+  irows = if (!(ans$allLen1 && (!nomatch0 || len.x==length(ans$starts)))) seqexp(ans$lens) ## left NULL (counted as nrow(i) just below) when allLen1 holds and no entry was dropped above
+  len.i = if (is.null(irows)) nrow(i) else length(irows)
+
+  if (length(ans$xo) && length(xrows)) ## non-empty xo is used as a lookup from xrows to rows of x; presumably the order of x computed by bmerge
+    xrows = ans$xo[xrows]
+  len.x = length(xrows) ## recomputed after the lookup through xo
+
+  if (len.i!=len.x) ## both sides must describe the same number of output rows
+    internal_error("dtmerge out len.i != len.x") # nocov
+
+  return(list(ans=ans, irows=irows, xrows=xrows))
+}
+
+seqexp = function(x) .Call(Cseqexp, x) ## thin R wrapper over the C routine Cseqexp; dtmerge calls it on bmerge 'lens' to build irows, presumably rep(seq_along(x), x) - confirm in the C source
+perhaps.data.table = function(x) .Call(CperhapsDataTableR, x) ## thin R wrapper over the C routine registered as CperhapsDataTableR in src/init.c; fdistinct uses it to validate its 'x' argument
diff --git a/src/init.c b/src/init.c
index e61c9b85e..8dd8277f7 100644
--- a/src/init.c
+++ b/src/init.c
@@ -151,6 +151,7 @@ R_CallMethodDef callMethods[] = {
 {"CconvertDate", (DL_FUNC)&convertDate, -1},
 {"Cnotchin", (DL_FUNC)&notchin, -1},
 {"Ccbindlist", (DL_FUNC) &cbindlist, -1},
+{"CperhapsDataTableR", (DL_FUNC) &perhapsDataTableR, -1},
 {"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1},
 {NULL, NULL, 0}
 };