When importing a flat file provided by a client into a SQL table the data in the file is often a bit of a mess and it can be easier to import all the columns as long strings into a loading table and to then try and figurre out the formats and datatypes from there.
To that end below is a stored procedure for analysing the columns in a specified table. This provides min and max column lengths as well as if the column has any nulls and the number of populated and distinct values.
CREATE procedure
[dbo].[sp__inspect_table]
(
@schema_name varchar(100),
@table_name varchar(100)
) as
IF OBJECT_ID(N'tempdb..#sql_schema_table') IS NOT NULL
BEGIN
DROP TABLE #sql_schema_table
END
declare @sql varchar(max)
declare
@sql_schema_query_table table
(
id int identity(1,1),
query varchar(max)
)
insert into
@sql_schema_query_table
(
query
)
select
'insert into
#sql_schema_table
(
schema_name,
table_name,
column_name,
datatype,
min_length,
max_length,
min_value,
max_value,
total_rows,
populated_values,
distinct_values,
nullable,
examples
)
select
''[' + @schema_name + ']'' as schema_name,
''[' + @table_name + ']'' as table_name,
cast(''[' + a.name + ']'' as varchar(100)) as column_name,
cast(''[' + d.name + ']'' as varchar(100)) as datatype,
min(len([' + a.name + '])) as min_length,
max(len([' + a.name + '])) as max_length,
min([' + a.name + ']) as min_value,
max([' + a.name + ']) as max_value,
count(*) as total_rows,
sum(case when [' + a.name + '] is null then 0 else 1 end) as populated_values,
count(distinct [' + a.name + ']) as distinct_values,
cast(sum(case when [' + a.name + '] is null then 1 else 0 end) as bit) as nullable,
null as examples
from
[' + @schema_name + '].[' + @table_name + ']'
from
sys.columns a
join
sys.objects b
on
a.object_id = b.object_id
join
sys.schemas c
on
b.schema_id = c.schema_id
join
sys.types d
on
a.system_type_id = d.system_type_id
where
c.name = @schema_name
and b.name = @table_name
and b.type = 'U'--tables only
and d.name not in ('varchar', 'nvarchar', 'bit')
insert into
@sql_schema_query_table
(
query
)
select
'insert into
#sql_schema_table
(
schema_name,
table_name,
column_name,
datatype,
min_length,
max_length,
min_value,
max_value,
total_rows,
populated_values,
distinct_values,
nullable,
examples
)
select
''[' + @schema_name + ']'' as schema_name,
''[' + @table_name + ']'' as table_name,
cast(''[' + a.name + ']'' as varchar(100)) as column_name,
cast(''[' + d.name + ']'' as varchar(100)) as datatype,
min(len([' + a.name + '])) as min_length,
max(len([' + a.name + '])) as max_length,
null as min_value,
null as max_value,
count(*) as total_rows,
sum(case when [' + a.name + '] is null then 0 else 1 end) as populated_values,
count(distinct [' + a.name + ']) as distinct_values,
cast(sum(case when [' + a.name + '] is null then 1 else 0 end) as bit) as nullable,
(select top 5 ISNULL(cast([' + a.name + '] as varchar(100)), ''NULL'') as ''[' + a.name + ']'', count(*) as total from [' + @schema_name + '].[' + @table_name + '] group by [' + a.name + '] order by count(*) desc for json auto) as examples
from
[' + @schema_name + '].[' + @table_name + ']'
from
sys.columns a
join
sys.objects b
on
a.object_id = b.object_id
join
sys.schemas c
on
b.schema_id = c.schema_id
join
sys.types d
on
a.system_type_id = d.system_type_id
where
c.name = @schema_name
and b.name = @table_name
and b.type = 'U'--tables only
and d.name in ('varchar', 'nvarchar', 'bit')
declare @sql_schema_query_id int
set @sql_schema_query_id = (select min(id) from @sql_schema_query_table)
create table
#sql_schema_table
(
id int identity(1,1),
schema_name varchar(100),
table_name varchar(100),
column_name varchar(100),
datatype varchar(100),
min_length int,
max_length int,
min_value varchar(100),
max_value varchar(100),
distinct_values int,
total_rows int,
populated_values int,
nullable bit,
examples varchar(2000)
)
while (@sql_schema_query_id is not null)
BEGIN
set @sql = (select query from @sql_schema_query_table where id = @sql_schema_query_id)
exec (@sql)
delete from @sql_schema_query_table where id = @sql_schema_query_id
set @sql_schema_query_id = (select min(id) from @sql_schema_query_table)
END
select
schema_name,
table_name,
column_name,
datatype,
min_length,
max_length,
min_value,
max_value,
total_rows,
distinct_values,
(distinct_values * 1.0) / (total_rows * 1.0) as distinct_value_percentage,
populated_values,
(populated_values * 1.0) / (total_rows * 1.0) as populated_value_percentage,
nullable,
examples
from
#sql_schema_table
order by
schema_name,
table_name,
id
GO
This can be pretty slow depending on the size and width of your tables and it could probably be updated to make it a bit more efficent but it’s certainly better than trying to do it manually.
0 Comments