When importing a flat file provided by a client into a SQL table the data in the file is often a bit of a mess and it can be easier to import all the columns as long strings into a loading table and to then try and figurre out the formats and datatypes from there.

To that end below is a stored procedure for analysing the columns in a specified table. This provides min and max column lengths as well as if the column has any nulls and the number of populated and distinct values.

CREATE procedure
	[dbo].[sp__inspect_table]
	(
	@schema_name varchar(100),
	@table_name varchar(100)
	) as
IF OBJECT_ID(N'tempdb..#sql_schema_table') IS NOT NULL
BEGIN
DROP TABLE #sql_schema_table
END

declare @sql varchar(max)
declare
	@sql_schema_query_table table
	(
	id int identity(1,1),
	query varchar(max)
	)

insert into
	@sql_schema_query_table
	(
	query
	)
select
	'insert into
		#sql_schema_table
		(
		schema_name,
		table_name,
		column_name,
		datatype,
		min_length,
		max_length,
		min_value,
		max_value,
		total_rows,
		populated_values,
		distinct_values,
		nullable,
		examples
		)
	select
		''[' + @schema_name + ']'' as schema_name,
		''[' + @table_name + ']'' as table_name,
		cast(''[' + a.name + ']'' as varchar(100)) as column_name,
		cast(''[' + d.name + ']'' as varchar(100)) as datatype,
		min(len([' + a.name + '])) as min_length,
		max(len([' + a.name + '])) as max_length,
		min([' + a.name + ']) as min_value,
		max([' + a.name + ']) as max_value,
		count(*) as total_rows,
		sum(case when [' + a.name + '] is null then 0 else 1 end) as populated_values,
		count(distinct [' + a.name + ']) as distinct_values,
		cast(sum(case when [' + a.name + '] is null then 1 else 0 end) as bit) as nullable,
		null as examples
	from
		[' + @schema_name + '].[' + @table_name + ']'
from
	sys.columns a
		join
			sys.objects b
				on
					a.object_id = b.object_id
		join
			sys.schemas c
				on
					b.schema_id = c.schema_id
		join
			sys.types d
				on
					a.system_type_id = d.system_type_id
where
	c.name = @schema_name
	and b.name = @table_name
	and b.type = 'U'--tables only
	and d.name not in ('varchar', 'nvarchar', 'bit')

insert into
	@sql_schema_query_table
	(
	query
	)
select
	'insert into
		#sql_schema_table
		(
		schema_name,
		table_name,
		column_name,
		datatype,
		min_length,
		max_length,
		min_value,
		max_value,
		total_rows,
		populated_values,
		distinct_values,
		nullable,
		examples
		)
	select
		''[' + @schema_name + ']'' as schema_name,
		''[' + @table_name + ']'' as table_name,
		cast(''[' + a.name + ']'' as varchar(100)) as column_name,
		cast(''[' + d.name + ']'' as varchar(100)) as datatype,
		min(len([' + a.name + '])) as min_length,
		max(len([' + a.name + '])) as max_length,
		null as min_value,
		null as max_value,
		count(*) as total_rows,
		sum(case when [' + a.name + '] is null then 0 else 1 end) as populated_values,
		count(distinct [' + a.name + ']) as distinct_values,
		cast(sum(case when [' + a.name + '] is null then 1 else 0 end) as bit) as nullable,
		(select top 5 ISNULL(cast([' + a.name + '] as varchar(100)), ''NULL'') as ''[' + a.name + ']'', count(*) as total from [' + @schema_name + '].[' + @table_name + '] group by [' + a.name + '] order by count(*) desc for json auto) as examples
	from
		[' + @schema_name + '].[' + @table_name + ']'
from
	sys.columns a
		join
			sys.objects b
				on
					a.object_id = b.object_id
		join
			sys.schemas c
				on
					b.schema_id = c.schema_id
		join
			sys.types d
				on
					a.system_type_id = d.system_type_id
where
	c.name = @schema_name
	and b.name = @table_name
	and b.type = 'U'--tables only
	and d.name in ('varchar', 'nvarchar', 'bit')

declare @sql_schema_query_id int
set @sql_schema_query_id = (select min(id) from @sql_schema_query_table)

create table
	#sql_schema_table
	(
	id int identity(1,1),
	schema_name varchar(100),
	table_name varchar(100),
	column_name varchar(100),
	datatype varchar(100),
	min_length int,
	max_length int,
	min_value varchar(100),
	max_value varchar(100),
	distinct_values int,
	total_rows int,
	populated_values int,
	nullable bit,
	examples varchar(2000)
	)

while (@sql_schema_query_id is not null)
BEGIN
	set @sql = (select query from @sql_schema_query_table where id = @sql_schema_query_id)
	exec (@sql)
	delete from @sql_schema_query_table where id = @sql_schema_query_id
	set @sql_schema_query_id = (select min(id) from @sql_schema_query_table)
END

select
	schema_name,
	table_name,
	column_name,
	datatype,
	min_length,
	max_length,
	min_value,
	max_value,
	total_rows,
	distinct_values,
	(distinct_values * 1.0) / (total_rows * 1.0) as distinct_value_percentage,
	populated_values,
	(populated_values * 1.0) / (total_rows * 1.0) as populated_value_percentage,
	nullable,
	examples
from
	#sql_schema_table
order by
	schema_name,
	table_name,
	id
GO

This can be pretty slow depending on the size and width of your tables and it could probably be updated to make it a bit more efficent but it’s certainly better than trying to do it manually.


0 Comments

Leave a Reply

Avatar placeholder

Your email address will not be published. Required fields are marked *