auto_detect maxes out at 24 columns with json? #4055

Open
ak--47 opened this issue Nov 11, 2024 · 4 comments
Comments

@ak--47 (Contributor) commented Nov 11, 2024

I have noticed that if I feed a newline-delimited JSON file to DuckDB's read_json() data import, where each JSON object has 25 or more unique keys, DuckDB just schematizes it as a single JSON column. It doesn't matter if auto_detect is on; it won't infer the schema of a JSON object with 25 or more keys (even if the keyspace is consistent across every line in the file).

This seems like a strange, undocumented limitation (or I am missing something obvious).

Here's some code which reproduces the behavior consistently. It generates two datasets: one whose JSON objects have 24 keys, the other 25. The 24-key file has its schema neatly inferred, while the 25-key file always ends up as a single variant JSON column, like this:
(screenshot: inferred schema)

import { Database } from 'duckdb-async';
import fs from 'fs/promises';
import path from 'path';
const TEMP_DIR = path.resolve('./');
const db = await Database.create(path.join(TEMP_DIR, 'duckdb.db'));
const connection = await db.connect();

// create sample data, one with 24 columns and the other with 25 columns
// {col1, col2, ... col24}
// {col1, col2, ... col25}

const dataWithTwentyFourCols = [];
const dataWithTwentyFiveCols = [];

Array.from({ length: 1000 }, (_, outer) => {
	const twnFour = {};
	const twnFive = {};

	Array.from({ length: 24 }, (_, inner) => {
		twnFour[`col_${inner}`] = makeName();
	});

	Array.from({ length: 25 }, (_, inner) => {
		twnFive[`col_${inner}`] = makeName();
	});
	dataWithTwentyFourCols.push(twnFour);
	dataWithTwentyFiveCols.push(twnFive);
});

await fs.writeFile(path.join(TEMP_DIR, 'twenty_four_cols.json'), toNdjson(dataWithTwentyFourCols));
await fs.writeFile(path.join(TEMP_DIR, 'twenty_five_cols.json'), toNdjson(dataWithTwentyFiveCols));


// load both data sets into tables
await connection.all(`
 CREATE OR REPLACE TABLE twenty_four_cols AS
	 SELECT * FROM read_json('${path.join(TEMP_DIR, 'twenty_four_cols.json')}',
	 auto_detect=true
	 )
	`);

await connection.all(`
	 CREATE OR REPLACE TABLE twenty_five_cols AS
	 SELECT * FROM read_json('${path.join(TEMP_DIR, 'twenty_five_cols.json')}',
	 auto_detect=true	 
	 )`);

// notice one table has 24 columns and the other has only 1 column


const schemaTwentyFourCols = await connection.all(`DESCRIBE twenty_four_cols`);
// ^ this auto-detects the schema, inferring the header + type for each column
const schemaTwentyFiveCols = await connection.all(`DESCRIBE twenty_five_cols`);
// ^ this produces a single MAP(VARCHAR, VARCHAR) column instead of 25 separate columns

console.log("\n# of auto-detected COLUMNS (24 keys)", schemaTwentyFourCols.length);
console.log("# of auto-detected COLUMNS (25 keys)", schemaTwentyFiveCols.length);
console.log("\n\nSCHEMA (25)", JSON.stringify(schemaTwentyFiveCols, null, 2));
console.log("\n\nwhy can't we force schema inference on >25 cols? (seems arbitrary)\n\n");

/**
 * helpers
 */

function toNdjson(data) {
	return data.map((row) => JSON.stringify(row)).join('\n');
}

function makeName(words = 3, separator = "-") {
	const adjs = [
		"dark", "grim", "swift", "brave", "bold", "fiery", "arcane",
		"rugged", "calm", "wild", "brisk", "dusty", "mighty", "sly",
		"old", "ghostly", "frosty", "gilded", "murky", "grand", "sly",
		"quick", "cruel", "meek", "glum", "drunk", "slick", "bitter",
		"nimble", "sweet", "tart", "tough"
	];

	const nouns = [
		"mage", "inn", "imp", "bard", "witch", "drake", "knight", "brew",
		"keep", "blade", "beast", "spell", "tome", "crown", "ale", "bard",
		"joke", "maid", "elf", "orc", "throne", "quest", "scroll", "fey",
		"pixie", "troll", "giant", "vamp", "ogre", "cloak", "gem", "axe",
		"armor", "fort", "bow", "lance", "moat", "den"
	];

	const verbs = [
		"cast", "charm", "brawl", "brew", "haunt", "sail", "storm", "quest",
		"joust", "feast", "march", "scheme", "raid", "guard", "duel",
		"trick", "flee", "prowl", "forge", "explore", "vanish", "summon",
		"banish", "bewitch", "sneak", "chase", "ride", "fly", "dream", "dance"
	];

	const adverbs = [
		"boldly", "bravely", "slyly", "wisely", "fiercely", "stealthily", "proudly", "eagerly",
		"quietly", "loudly", "heroically", "craftily", "defiantly", "infamously", "cleverly", "dastardly"
	];

	const continuations = [
		"and", "of", "in", "on", "under", "over", "beyond", "within", "while", "during", "after", "before",
		"beneath", "beside", "betwixt", "betwain", "because", "despite", "although", "however", "nevertheless"
	];

	let string;
	const cycle = [adjs, nouns, verbs, adverbs, continuations];
	for (let i = 0; i < words; i++) {
		const index = i % cycle.length;
		const word = cycle[index][Math.floor(Math.random() * cycle[index].length)];
		if (!string) {
			string = word;
		} else {
			string += separator + word;
		}
	}

	return string;
}

Is there some way to change how this works, or to force DuckDB to do schema inference? I'd even be happy if I could get a view with the schema inferred (my data has hundreds of columns and I don't know which ones I need at the time I load the data...).

@Mytherin Mytherin transferred this issue from duckdb/duckdb-node Nov 12, 2024
@Mytherin (Collaborator) commented Nov 12, 2024

Thanks for the report! This is related to the map_inference_threshold parameter, which does not appear to be documented yet. It defaults to 25, and can be configured in the read_json function (e.g. read_json('file.json', map_inference_threshold=50)). It can be disabled entirely by using map_inference_threshold=-1. See duckdb/duckdb#11285 for the PR that added this functionality.
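Applied to the reproduction above, that would look something like the following (a minimal sketch, assuming the same connection, path, and TEMP_DIR from the original script; map_inference_threshold is the parameter named in this comment):

// re-load the 25-key file, but disable map inference so each key becomes its own column
await connection.all(`
	CREATE OR REPLACE TABLE twenty_five_cols AS
	SELECT * FROM read_json('${path.join(TEMP_DIR, 'twenty_five_cols.json')}',
		auto_detect=true,
		map_inference_threshold=-1
	)`);

const schemaTwentyFiveColsFixed = await connection.all(`DESCRIBE twenty_five_cols`);
console.log("# of auto-detected COLUMNS (25 keys)", schemaTwentyFiveColsFixed.length); // expect 25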

@ak--47 (Contributor, Author) commented Nov 13, 2024

@Mytherin thank you!!!!! Exactly what I was looking for...

@ak--47 ak--47 closed this as completed Nov 13, 2024
@Mytherin Mytherin reopened this Nov 13, 2024
@Mytherin (Collaborator) commented:
Let's keep the issue open so we can remember to document it :)

ak--47 added a commit to ak--47/duckdb-web that referenced this issue Nov 14, 2024
@ak--47 (Contributor, Author) commented Nov 14, 2024

@Mytherin here's a PR which updates the docs:
#4065

Mostly pulled language from Ziya's PR.
