GDELT data into ms-sql

Go…here
download the historical backfiles

Use the following script to create a database in the required location:

CREATE TABLE GDELT_HISTORICAL (
 GLOBALEVENTID bigint , --1
 SQLDATE int, 
 MonthYear char(6) , 
 [Year] char(4) , 
 FractionDate decimal , --5
 Actor1Code char(55) , 
 Actor1Name char(255) , 
 Actor1CountryCode char(55) , 
 Actor1KnownGroupCode char(55) , 
 Actor1EthnicCode char(55) , --10
 Actor1Religion1Code char(55) , 
 Actor1Religion2Code char(55) , 
 Actor1Type1Code char(55) , 
 Actor1Type2Code char(55) , 
 Actor1Type3Code char(55) , 
 Actor2Code char(55) , --16
 Actor2Name char(255) , 
 Actor2CountryCode char(55) , 
 Actor2KnownGroupCode char(55) , 
 Actor2EthnicCode char(55) , 
 Actor2Religion1Code char(55) , 
 Actor2Religion2Code char(55) , 
 Actor2Type1Code char(55) , 
 Actor2Type2Code char(55) , 
 Actor2Type3Code char(55) , 
 IsRootEvent int , 
 EventCode char(4) , 
 EventBaseCode char(4) , 
 EventRootCode char(4) , 
 QuadClass int , 
 GoldsteinScale decimal , 
 NumMentions int , 
 NumSources int , 
 NumArticles int , 
 AvgTone decimal , 
 Actor1Geo_Type int  , 
 Actor1Geo_FullName char(255) , 
 Actor1Geo_CountryCode char(2) , 
 Actor1Geo_ADM1Code char(4) , 
 Actor1Geo_Lat float , 
 Actor1Geo_Long float , 
 Actor1Geo_FeatureID int  , 
 Actor2Geo_Type int  , 
 Actor2Geo_FullName char(255) , 
 Actor2Geo_CountryCode char(2) , 
 Actor2Geo_ADM1Code char(4) , 
 Actor2Geo_Lat float , 
 Actor2Geo_Long float , 
 Actor2Geo_FeatureID int  , 
 ActionGeo_Type int  , 
 ActionGeo_FullName char(255) , 
 ActionGeo_CountryCode char(2) , 
 ActionGeo_ADM1Code char(4) , 
 ActionGeo_Lat float , 
 ActionGeo_Long float , 
 ActionGeo_FeatureID float  , 
 DATEADDED int
);

Unzip all your history files into one location and then run this script for each file:

  BULK INSERT GDELT_HISTORICAL
    FROM 'C:\Users\MONKEYMIKE\Desktop\201302.csv'
    WITH
        (
		FIELDTERMINATOR = '\t'
		, ROWTERMINATOR = '0x0a'--'\n'
		 )

update the main table and several sub tables using the autonumber primary key from the first table…

You’ve got a table (here bed_beds with pk = bed_refno) and you want to insert a new ‘bed’ and update the bed_beds_specialties and bed_sex_status tables using the newly created auto-generated id key.

Create a Stored Procedure (SP) and use the the SCOPE_IDENTITY() function to keep a record of the identity key generated within the scope. Then pass that to the INSERT queries used further down in the SP.


CREATE PROCEDURE spInsert_Bed
@wardRefno numeric(10,0),
@typeRefno numeric(10,0),
@dStart DATETIME,
@dEnd DATETIME,
@dCreate DATETIME,
@vUser VARCHAR(15)
AS
SET NOCOUNT ON
declare @lastin numeric(10,0)
INSERT INTO bed_beds (ward_refno, type_refno, start_Dttm, end_Dttm,create_Dttm,user_create)
VALUES (@wardRefno, @typeRefno, @dStart, @dEnd,@dCreate,@vUser)
set @lastin=  SCOPE_IDENTITY()

INSERT INTO bed_beds_specialties (bed_refno, spect_refno,comments, start_Dttm, end_Dttm,create_Dttm,user_create)
VALUES (@lastin, 3,'my sp test', @dStart, @dEnd,getdate(),@vUser)

INSERT INTO bed_sex_status (bed_refno, start_Dttm, end_Dttm,sex_rfval_refno,create_Dttm,user_create)
VALUES (@lastin,  @dStart,@dEnd,6,getdate(),@vUser)

GO

Calculate Percentiles with MS-SQL 2005 using interpolation…

You’ll need MS SQL 2005 or above, it looks like version 2012 has it built in see here!
This is my first use of the OVER bit so it may be a little ugly, also I have been using the old way to pivot stuff so it’s a bit messy at the end.

declare @t table (percentile numeric(10,5))
insert into @t values (0)
insert into @t values (0.25)
insert into @t values (0.5)
insert into @t values (0.75)
insert into @t values (0.8)
insert into @t values (1.0)

--give rownumbers for your data over a partition
drop table ##mgw
SELECT 
	s.gppraccode
	,age
	,ROW_NUMBER() OVER (partition by gppraccode ORDER BY age) AS "RowNumber"
INTO ##mgw
FROM mgw_ane_base AS s 

--calculate the individual values per percentile
drop table ##mgw_percentileHolding
select	t.percentile*(max(rownumber)-1)+1 as act
		,floor(t.percentile*(max(rownumber)-1)+1) as K
		,((t.percentile*(max(rownumber)-1)+1) % 1) as D
		,count(*) as N
		,t.percentile
		,a.gppraccode
into ##mgw_percentileHolding
from ##mgw as a join @t as t on 1=1
group by t.percentile,a.gppraccode

--fold it all over in a manual-ish pivot
select 
	gppraccode
	,max(case when percentile=0 then PercentileValue end) as [0%]
	,max(case when percentile=0.25 then PercentileValue end) as [25%]
	,max(case when percentile=0.5 then PercentileValue end) as [50%]
	,max(case when percentile=0.75 then PercentileValue end) as [75%]
	,max(case when percentile=0.8 then PercentileValue end) as [80%]
	,max(case when percentile=1 then PercentileValue end) as [100%]
	,N
FROM
	(
	SELECT 
	*
	,(SELECT max(age) from ##mgw as b where b.rownumber<=a.act and a.gppraccode=b.gppraccode) as below
	,(SELECT min(age) from ##mgw as b where b.rownumber>=a.act  and a.gppraccode=b.gppraccode) as above
	,(SELECT min(age) from ##mgw as b where b.rownumber>=a.act  and a.gppraccode=b.gppraccode)-(select max(age) from ##mgw as b where b.rownumber<=a.act and a.gppraccode=b.gppraccode ) as f
	,case	when percentile=0 then (SELECT min(age) from ##mgw as b where a.gppraccode=b.gppraccode) 
			when percentile=1 then (SELECT max(age) from ##mgw as b where a.gppraccode=b.gppraccode)  
			else (SELECT max(age) from ##mgw as b where b.rownumber<=a.act and a.gppraccode=b.gppraccode)+D*((select min(age) from ##mgw as b where b.rownumber>=a.act and a.gppraccode=b.gppraccode)-(select max(age) from ##mgw as b where b.rownumber<=a.act and a.gppraccode=b.gppraccode)) end as PercentileValue
	FROM ##mgw_percentileHolding as a
) as internal
group by gppraccode,N

Row_number(), dense Rank rank etc

You need SQL 2005 or greater for this:
Rownumbers with partitions:

SELECT 
    s.gppraccode
    ,age
    ,ROW_NUMBER()OVER (PARTITION BY gppraccode ORDER BY age asc) As [RowNumber]
FROM mgw_ane_base AS s 
SELECT s.gppraccode, age
    ,ROW_NUMBER() OVER (ORDER BY age) AS "RowNumber"
    ,RANK() OVER (ORDER BY age) AS Rank
    ,DENSE_RANK() OVER (ORDER BY age) AS "DenseRank"
    ,NTILE(10) OVER (ORDER BY age) AS Quartile
INTO ##mgw
FROM mgw_ane_base AS s 

Rank & Miscellany

Rank things in the table…..very usefull for more complex stuff

SELECT
    --your select bits here
,(Select count(*)	FROM YOUR_TABLE_NAME as internal
WHERE internal.refrl_refno=external.refrl_refno
AND internal.start_dttm<external.start_dttm
--be sure to have the same criteria applied to the internal as to the external versions of the table
) as drank

Age groups…
Slot these into a CASE END Sandwich

When age between  0 and  4 then '00-04'
When age between  5 and  9 then '05-09'
When age between 10 and 14 then '10-14'
When age between 15 and 19 then '15-19'
When age between 20 and 24 then '20-24'
When age between 25 and 29 then '25-29'
When age between 30 and 34 then '30-34'
When age between 35 and 39 then '35-39'
When age between 40 and 44 then '40-44'
When age between 45 and 49 then '45-49'
When age between 50 and 54 then '50-54'
When age between 55 and 59 then '55-59'
When age between 60 and 64 then '60-64'
When age between 65 and 69 then '65-69'
When age between 70 and 74 then '70-74'
When age between 75 and 79 then '75-79'
When age between 80 and 84 then '80-84'
When age between 85 and 89 then '85-89'
When age between 90 and 94 then '90-94'
When age between 95 and 99 then '95-99'

Create a table variable…

DECLARE @sw  table
(
field_refno numeric
, var_field varchar(10)
)
INSERT INTO @sw VALUES(1,'Hudgle')

Using HAVING

SELECT
	count(prvsp_refno) as N
	, patnt_refno
FROM 	provider_spells
WHERE
	inmgt_refno=200870
	and admit_dttm > '2008-01-01'
GROUP BY patnt_refno
HAVING
	count(prvsp_refno)>10

SQL time-ish stuff

Extract the time:

--get the time as an hour
DATEPART(hh,getdate())
Extract the dayname:
--get the day as a word (short)
UPPER(left(datename(dw,getdate()),3))
--get something nicely formatted.
substring(convert(char,getdate() ,120),12,5)

Create a table full of sequential date_time

USE tempdb
DECLARE @t int
DECLARE @inc_date datetime 
SET @inc_date = '2001-01-01 23:59:59'  
--CHANGE THIS TIME TO GET DIFFERENT SNAPSHOT TIME first date and time of temp table 

--if the temp table exists drop and recreate with the date time supplied above	
IF  (SELECT '1' as exist_ WHERE exists (select '1' from sysobjects  WHERE  name='##Temp_dates')) = '1' 
	BEGIN DROP TABLE  ##Temp_dates END
--create the table
CREATE TABLE ##Temp_dates(t_date datetime NOT NULL PRIMARY KEY) 
	SET NOCOUNT ON  
WHILE @inc_date <= '2012-12-31' 
	BEGIN 
	INSERT ##Temp_dates (t_date) VALUES (@inc_date) 
	SET @inc_date = @inc_date + 1 --or decimal value for hours 0.5 etc..
	END

SQL Date Scriptlets….

Convert calendar month number to financial year month number i.e. April = 1 rather than 4
In the following DTTM = Your date field

(MONTH([DTTM])+8)%12+1

Obtain Financial year from a date.

CASE WHEN MONTH(DTTM) IN (1,2,3) THEN CONVERT(CHAR(4),YEAR(DTTM)-1)+'/'+CONVERT(CHAR(4),(YEAR(DTTM))) 
ELSE CONVERT(CHAR(4),YEAR(DTTM)) +'/'+CONVERT(CHAR(4),YEAR(DTTM)+1) END as fyear

Some Automatic Dates:

--first day of current month
DATEADD(MM, DATEDIFF(M, 0, GETDATE()), 0)  

--first day of previous months
DATEADD(MM, DATEDIFF(M, 0, GETDATE())-1 , 0)  

--last day of previous month
DATEADD(ms,-2,DATEADD(MM, DATEDIFF(M, 0, GETDATE()) , 0))  

--tomorrow evening
DATEADD(ms,-2,DATEADD(dd, DATEDIFF(d, 0, GETDATE())+2 , 0))  

…or swap the getdate() to a field to get the appropriate conversion from a field.

Select the first day of the current financial year…

case when month(DTTM) between 4 and 12 
then DATEADD(MM, DATEDIFF(M, 0, DTTM)-(month(DTTM)-4) , 0) 
else   DATEADD(MM, DATEDIFF(M, 0, DTTM)-(month(DTTM)+8) , 0) end