- Filter the speedtest conducted in Barcelona or Madrid. Then list the internet providers working in those cities.
NeubotTests = LOAD 'NeubotTests' using PigStorage(';') as ( client_address: chararray, client_country: chararray, lon: float, lat: float, client_provider: chararray, mlabservername: chararray, connect_time: float, download_speed: float, neubot_version: float, platform: chararray, remote_address: chararray, test_name: chararray, timestamp: long, upload_speed: float, latency: float, uuid: chararray, asnum: chararray, region: chararray, city: chararray, hour: int, month: int, year: int, weekday: int, day: int, filedate: chararray ); -- -- A1: Internet Providers in 'Barcelona' or 'Madrid' where speedtests were conducted -- SpeedTests = FILTER @ BY (test_name matches '.*speedtest.*'); SpeedTests = FILTER @ BY ( city matches '.*Barcelona.*' OR city matches '.*Madrid.*' ); Providers = FOREACH @ GENERATE city, client_provider; Providers = DISTINCT @; DUMP @;
- List the names and the IP ranges of the internet providers located in Barcelona. For this you need to use the IPtoNumber user defined function (cf. NeubotTestsUDFs.jar).
NeubotTests = LOAD 'NeubotTests' using PigStorage(';') as ( client_address: chararray, client_country: chararray, lon: float, lat: float, client_provider: chararray, mlabservername: chararray, connect_time: float, download_speed: float, neubot_version: float, platform: chararray, remote_address: chararray, test_name: chararray, timestamp: long, upload_speed: float, latency: float, uuid: chararray, asnum: chararray, region: chararray, city: chararray, hour: int, month: int, year: int, weekday: int, day: int, filedate: chararray ); -- -- A2: Internet Providers in Barcelona and their IP range based on the speedtests observations -- SpeedTests = FILTER @ BY (test_name matches '.*speedtest.*'); SpeedTests = FILTER @ BY ( city matches '.*Barcelona.*' ); Providers = FOREACH @ GENERATE city, client_provider, IPtoNumber(client_address) AS ip ; Providers = GROUP @ BY client_provider; Providers_IP_Range = FOREACH @ GENERATE group, NumberToIP( MIN(Providers.ip) ), NumberToIP( MAX(Providers.ip) ) ; DUMP @;
- Group the speedtest based on the user network infrastructure (e.g., 3G/4G vs ADSL). For this you can assume some max bandwidth (e.g., 21Mb/sec for ADSL).
NeubotTests = LOAD 'NeubotTests' using PigStorage(';') as ( client_address: chararray, client_country: chararray, lon: float, lat: float, client_provider: chararray, mlabservername: chararray, connect_time: float, download_speed: float, neubot_version: float, platform: chararray, remote_address: chararray, test_name: chararray, timestamp: long, upload_speed: float, latency: float, uuid: chararray, asnum: chararray, region: chararray, city: chararray, hour: int, month: int, year: int, weekday: int, day: int, filedate: chararray ); -- -- A3: Speedtests (conducted in Barcelona) organized by network type: Mobile vs ADSL -- SpeedTests = FILTER @ BY (test_name matches '.*speedtest.*'); SpeedTests = FILTER @ BY ( city matches '.*Barcelona.*' ); SPLIT @ INTO Mobile_Tests IF ( download_speed > 21000000 -- 21 Mb / sec ), ADSL_Tests IF ( download_speed <= 21000000 -- 21 Mb / sec ) ; MobileSpeeds = FOREACH Mobile_Tests GENERATE CEIL(download_speed / 1000000) AS download_speed, 'mobile' AS network_type: chararray ; ADSLSpeeds = FOREACH ADSL_Tests GENERATE CEIL(download_speed / 1000000) AS download_speed, 'adsl' AS network_type: chararray ; Speeds = UNION MobileSpeeds, ADSLSpeeds; Speeds = GROUP @ BY (download_speed, network_type); Speeds = FOREACH @ GENERATE CONCAT( (chararray) group.download_speed, ' mb/sec' ), group.network_type, COUNT(Speeds) ; DUMP @;
- Find the user that realized the maximum number of tests. For this user, produce a table showing the evolution of her/his download/upload speeds.
NeubotTests = LOAD 'NeubotTests' using PigStorage(';') as ( client_address: chararray, client_country: chararray, lon: float, lat: float, client_provider: chararray, mlabservername: chararray, connect_time: float, download_speed: float, neubot_version: float, platform: chararray, remote_address: chararray, test_name: chararray, timestamp: long, upload_speed: float, latency: float, uuid: chararray, asnum: chararray, region: chararray, city: chararray, hour: int, month: int, year: int, weekday: int, day: int, filedate: chararray ); -- -- Determines the user that realized the maximum number of tests and -- obtain his/her download_speed log -- Tests = FILTER @ BY (test_name matches '.*speedtest.*'); Tests_In_Barcelona = FILTER @ BY ( city matches '.*Barcelona.*' ); Tests_Per_User = GROUP Tests_In_Barcelona BY uuid; Tests_Per_User = FOREACH @ GENERATE group AS uuid, COUNT(Tests_In_Barcelona) AS numberOfTests ; MAX_NUM_TESTS = GROUP @ ALL; MAX_NUM_TESTS = FOREACH @ GENERATE MAX( Tests_Per_User.numberOfTests) AS numberOfTests ; TOP_1_USER = JOIN Tests_Per_User BY numberOfTests, MAX_NUM_TESTS BY numberOfTests ; TOP_1_USER = FOREACH @ GENERATE Tests_Per_User::uuid AS uuid; TOP_1_USER_TESTS = JOIN Tests BY uuid, TOP_1_USER BY uuid ; TOP_1_USER_TESTS = FOREACH @ GENERATE Tests::uuid AS uuid, Tests::city AS city, Tests::timestamp AS timestamp, Tests::download_speed AS download_speed ; TOP_1_USER_TESTS = ORDER @ BY timestamp; STORE @ INTO 'Top_1_User' USING PigStorage(',');